
Commit 50551f6

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-7061-add-vllm-metrics
2 parents: 6f601f4 + 5320009

File tree: 2 files changed (+19 lines, -48 lines)

qa/L0_perf_tensorrt_llm/test.sh

Lines changed: 18 additions & 47 deletions
@@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt"
 MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
-TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend"
+TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
 GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
@@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
 function clone_tensorrt_llm_backend_repo {
     rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
     apt-get update && apt-get install git-lfs -y --no-install-recommends
-    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
     cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
 }

 # Update Open MPI to a version compatible with SLURM.
 function upgrade_openmpi {
-    cd /tmp/
     local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}')

     if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then
         # Uninstall the current version of Open MPI
-        wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || {
-            echo "Failed to download Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || {
-            echo "Failed to extract Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || {
-            echo "Failed to uninstall Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || {
-            echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories"
+        rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || {
+            echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION."
             exit 1
         }
-        cd ../ && rm -r openmpi-${CURRENT_VERSION}
     else
-        echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade."
+        echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade."
         return
     fi

     # Install SLURM supported Open MPI version
+    cd /tmp/
     wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || {
         echo "Failed to download Open MPI 5.0.1"
         exit 1
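
Note: the version probe in this hunk strips any release-candidate suffix before handing the string to dpkg. A quick way to see the effect (the sample output line is illustrative, not taken from a real system):

    # "4.1.5rc2" is a made-up sample of typical `mpirun --version` output.
    echo "mpirun (Open MPI) 4.1.5rc2" | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}'
    # Prints "4.1.5", so `dpkg --compare-versions "4.1.5" lt "5.0.1"`
    # succeeds and the upgrade branch runs.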
@@ -108,18 +95,6 @@ function upgrade_openmpi {
     mpirun --version
 }

-function install_tensorrt_llm {
-    # Install CMake
-    bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    TORCH_INSTALL_TYPE="pypi" &&
-        (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm &&
-            bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-            python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt &&
-            pip3 install ./build/tensorrt_llm*.whl)
-}
-
 function build_gpt2_base_model {
     # Download weights from HuggingFace Transformers
     cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
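
Note: with install_tensorrt_llm removed (and the backend directory moved to /workspace above), the test appears to assume the container image already ships the tensorrt_llm Python package instead of building it from source. A minimal sanity check, if one were needed -- this is a hypothetical addition, not part of the script:

    # Hypothetical check: confirm the image provides tensorrt_llm.
    python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"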
@@ -131,24 +106,21 @@ function build_gpt2_base_model {
     cd ${GPT_DIR}

     # Convert weights from HF Tranformers to FT format
-    python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
     cd ${BASE_DIR}
 }

 function build_gpt2_tensorrt_engine {
     # Build TensorRT engines
     cd ${GPT_DIR}
-    python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \
-        --world_size="${NUM_GPUS}" \
-        --dtype float16 \
-        --use_inflight_batching \
-        --use_gpt_attention_plugin float16 \
-        --paged_kv_cache \
-        --use_gemm_plugin float16 \
-        --remove_input_padding \
-        --hidden_act gelu \
-        --parallel_build \
-        --output_dir="${ENGINES_DIR}"
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
     cd ${BASE_DIR}
 }

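Note: the old single-step build.py is replaced by the convert-then-build flow (convert_checkpoint.py above, then trtllm-build), and boolean flags such as --paged_kv_cache now take enable/disable values. After a successful build, ENGINES_DIR should hold a serialized engine per rank plus its build config; the listing below is illustrative, and exact file names vary by TensorRT-LLM version:

    # Illustrative only: inspect the build output.
    ls "${ENGINES_DIR}"
    # e.g. config.json  rank0.engine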
@@ -172,18 +144,18 @@ function prepare_model_repository {
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
 }

 # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
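
Note: replace_config_tags is defined elsewhere in test.sh, outside this diff. A minimal sketch of what such a helper typically looks like, assuming it performs a literal sed substitution over the template tags:

    # Hypothetical implementation; the real definition is not shown here.
    function replace_config_tags {
        local tag_to_replace="$1" new_value="$2" config_file_path="$3"
        # GNU sed: a mid-pattern `$` and a bare `{` are treated literally,
        # so tags like ${triton_max_batch_size} match as plain text.
        sed -i "s|${tag_to_replace}|${new_value}|g" "${config_file_path}"
    }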
@@ -244,13 +216,12 @@ function kill_server {

 upgrade_openmpi
 clone_tensorrt_llm_backend_repo
-install_tensorrt_llm
 build_gpt2_base_model
 build_gpt2_tensorrt_engine
 prepare_model_repository

 # Install perf_analyzer
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient

 ARCH="amd64"
 STATIC_BATCH=1

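Note: the nvidia-ml-py3 dependency is dropped from both perf tests; only tritonclient is needed for the perf_analyzer runs that follow. An illustrative invocation is sketched below; the real flags live further down the script, outside this diff:

    # Hypothetical example; MODEL_NAME and STATIC_BATCH come from the script.
    perf_analyzer -m "${MODEL_NAME}" -b "${STATIC_BATCH}" --concurrency-range 1:4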
qa/L0_perf_vllm/test.sh

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0}
 EXPORT_FILE=profile-export-vllm-model.json

-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv

 mkdir -p $MODEL_REPO/$MODEL_NAME/1
