
Commit 50551f6

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-7061-add-vllm-metrics
2 parents: 6f601f4 + 5320009

File tree: 2 files changed (+19 lines, -48 lines)

qa/L0_perf_tensorrt_llm/test.sh

Lines changed: 18 additions & 47 deletions
@@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt"
 MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
-TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend"
+TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
 GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
@@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
 function clone_tensorrt_llm_backend_repo {
     rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
     apt-get update && apt-get install git-lfs -y --no-install-recommends
-    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
     cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
 }

 # Update Open MPI to a version compatible with SLURM.
 function upgrade_openmpi {
-    cd /tmp/
     local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}')

     if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then
         # Uninstall the current version of Open MPI
-        wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || {
-            echo "Failed to download Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || {
-            echo "Failed to extract Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || {
-            echo "Failed to uninstall Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || {
-            echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories"
+        rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || {
+            echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION."
             exit 1
         }
-        cd ../ && rm -r openmpi-${CURRENT_VERSION}
     else
-        echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade."
+        echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade."
         return
     fi

     # Install SLURM supported Open MPI version
+    cd /tmp/
     wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || {
         echo "Failed to download Open MPI 5.0.1"
         exit 1
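
Note: the version probe in this hunk strips any release-candidate suffix before handing the string to dpkg. A quick way to see the effect (the sample output line is illustrative, not taken from a real system):

    # "4.1.5rc2" is a made-up sample of typical `mpirun --version` output.
    echo "mpirun (Open MPI) 4.1.5rc2" | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}'
    # Prints "4.1.5", so `dpkg --compare-versions "4.1.5" lt "5.0.1"`
    # succeeds and the upgrade branch runs.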
@@ -108,18 +95,6 @@ function upgrade_openmpi {
     mpirun --version
 }

-function install_tensorrt_llm {
-    # Install CMake
-    bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    TORCH_INSTALL_TYPE="pypi" &&
-        (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm &&
-            bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-            python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt &&
-            pip3 install ./build/tensorrt_llm*.whl)
-}
-
 function build_gpt2_base_model {
     # Download weights from HuggingFace Transformers
     cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
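
Note: with install_tensorrt_llm removed (and the backend directory moved to /workspace above), the test appears to assume the container image already ships the tensorrt_llm Python package instead of building it from source. A minimal sanity check, if one were needed -- this is a hypothetical addition, not part of the script:

    # Hypothetical check: confirm the image provides tensorrt_llm.
    python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"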
@@ -131,24 +106,21 @@ function build_gpt2_base_model {
     cd ${GPT_DIR}

     # Convert weights from HF Tranformers to FT format
-    python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
     cd ${BASE_DIR}
 }

 function build_gpt2_tensorrt_engine {
     # Build TensorRT engines
     cd ${GPT_DIR}
-    python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \
-        --world_size="${NUM_GPUS}" \
-        --dtype float16 \
-        --use_inflight_batching \
-        --use_gpt_attention_plugin float16 \
-        --paged_kv_cache \
-        --use_gemm_plugin float16 \
-        --remove_input_padding \
-        --hidden_act gelu \
-        --parallel_build \
-        --output_dir="${ENGINES_DIR}"
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
     cd ${BASE_DIR}
 }

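Note: the old single-step build.py is replaced by the convert-then-build flow (convert_checkpoint.py above, then trtllm-build), and boolean flags such as --paged_kv_cache now take enable/disable values. After a successful build, ENGINES_DIR should hold a serialized engine per rank plus its build config; the listing below is illustrative, and exact file names vary by TensorRT-LLM version:

    # Illustrative only: inspect the build output.
    ls "${ENGINES_DIR}"
    # e.g. config.json  rank0.engine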
@@ -172,18 +144,18 @@ function prepare_model_repository {
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
 }

 # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
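
Note: replace_config_tags is defined elsewhere in test.sh, outside this diff. A minimal sketch of what such a helper typically looks like, assuming it performs a literal sed substitution over the template tags:

    # Hypothetical implementation; the real definition is not shown here.
    function replace_config_tags {
        local tag_to_replace="$1" new_value="$2" config_file_path="$3"
        # GNU sed: a mid-pattern `$` and a bare `{` are treated literally,
        # so tags like ${triton_max_batch_size} match as plain text.
        sed -i "s|${tag_to_replace}|${new_value}|g" "${config_file_path}"
    }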
@@ -244,13 +216,12 @@ function kill_server {

 upgrade_openmpi
 clone_tensorrt_llm_backend_repo
-install_tensorrt_llm
 build_gpt2_base_model
 build_gpt2_tensorrt_engine
 prepare_model_repository

 # Install perf_analyzer
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient

 ARCH="amd64"
 STATIC_BATCH=1

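Note: the nvidia-ml-py3 dependency is dropped from both perf tests; only tritonclient is needed for the perf_analyzer runs that follow. An illustrative invocation is sketched below; the real flags live further down the script, outside this diff:

    # Hypothetical example; MODEL_NAME and STATIC_BATCH come from the script.
    perf_analyzer -m "${MODEL_NAME}" -b "${STATIC_BATCH}" --concurrency-range 1:4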
qa/L0_perf_vllm/test.sh

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0}
 EXPORT_FILE=profile-export-vllm-model.json

-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv

 mkdir -p $MODEL_REPO/$MODEL_NAME/1
