From bf88f4afa758ef0f5643949594a6724e380ae34f Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale
Date: Mon, 28 Apr 2025 18:03:07 -0700
Subject: [PATCH 1/2] Fix L0_orca_trtllm, which was broken by changes to the
 trtllm directory structure, and add missing function definitions

---
 qa/L0_orca/test.sh              | 45 ++++++++++++++++++++++++++++++++-
 qa/L0_perf_tensorrt_llm/test.sh |  4 +--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/qa/L0_orca/test.sh b/qa/L0_orca/test.sh
index 6069a75048..61e9a9726a 100755
--- a/qa/L0_orca/test.sh
+++ b/qa/L0_orca/test.sh
@@ -36,7 +36,7 @@ MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
 TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
-GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
+GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/models/core/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
 TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
@@ -48,6 +48,13 @@ CLIENT_PY=${BASE_DIR}/orca_http_test.py
 CLIENT_LOG="${NAME}_orca_http_test.log"
 source ../common/util.sh
 
+function replace_config_tags {
+    tag_to_replace="${1}"
+    new_value="${2}"
+    config_file_path="${3}"
+    sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path}
+}
+
 function prepare_model_repository {
     rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY}
     cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY}
@@ -138,6 +145,42 @@ function kill_server {
     done
 }
 
+function clone_tensorrt_llm_backend_repo {
+    rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
+    apt-get update && apt-get install git-lfs -y --no-install-recommends
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
+}
+
+function build_gpt2_base_model {
+    # Download weights from HuggingFace Transformers
+    cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
+    rm pytorch_model.bin model.safetensors
+    if ! wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then
+        echo "Downloading pytorch_model.bin failed."
+        exit 1
+    fi
+    cd ${GPT_DIR}
+
+    # Convert weights from HF Transformers to FT format
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
+    cd ${BASE_DIR}
+}
+
+function build_gpt2_tensorrt_engine {
+    # Build TensorRT engines
+    cd ${GPT_DIR}
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
+    cd ${BASE_DIR}
+}
+
 clone_tensorrt_llm_backend_repo
 build_gpt2_base_model
 build_gpt2_tensorrt_engine
diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh
index e74b01e568..2e0c64e30c 100755
--- a/qa/L0_perf_tensorrt_llm/test.sh
+++ b/qa/L0_perf_tensorrt_llm/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -35,7 +35,7 @@ MODEL_NAME="gpt2_tensorrt_llm" NAME="tensorrt_llm_benchmarking_test" MODEL_REPOSITORY="$(pwd)/triton_model_repo" TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" -GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" +GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/models/core/gpt" TOKENIZER_DIR="$GPT_DIR/gpt2" ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} From 5d32dffbb87d69da6508ea745ee0fa9e01fca66a Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 May 2025 12:17:30 -0700 Subject: [PATCH 2/2] Move common functions to common/util.sh --- qa/L0_orca/test.sh | 77 -------------------------------- qa/L0_perf_tensorrt_llm/test.sh | 71 +---------------------------- qa/common/util.sh | 79 ++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 148 deletions(-) diff --git a/qa/L0_orca/test.sh b/qa/L0_orca/test.sh index 61e9a9726a..4d0e387b2b 100755 --- a/qa/L0_orca/test.sh +++ b/qa/L0_orca/test.sh @@ -48,47 +48,6 @@ CLIENT_PY=${BASE_DIR}/orca_http_test.py CLIENT_LOG="${NAME}_orca_http_test.log" source ../common/util.sh -function replace_config_tags { - tag_to_replace="${1}" - new_value="${2}" - config_file_path="${3}" - sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} -} - -function prepare_model_repository { - rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY} - cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY} - rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls - mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}" - - replace_config_tags "model_version: -1" "model_version: 1" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - replace_config_tags 'name: "ensemble"' "name: \"$MODEL_NAME\"" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "1000000" 
"${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP32" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" -} - # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on # success, 1 on failure function wait_for_server_ready() { @@ -145,42 +104,6 @@ function kill_server { done } -function clone_tensorrt_llm_backend_repo { - rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR - apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR - cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive -} - -function build_gpt2_base_model { - # Download weights from HuggingFace Transformers - cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 - rm pytorch_model.bin model.safetensors - if ! wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then - echo "Downloading pytorch_model.bin failed." - exit 1 - fi - cd ${GPT_DIR} - - # Convert weights from HF Tranformers to FT format - python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" - cd ${BASE_DIR} -} - -function build_gpt2_tensorrt_engine { - # Build TensorRT engines - cd ${GPT_DIR} - trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --gpt_attention_plugin float16 \ - --remove_input_padding enable \ - --paged_kv_cache enable \ - --gemm_plugin float16 \ - --workers "${NUM_GPUS}" \ - --output_dir "${ENGINES_DIR}" - - cd ${BASE_DIR} -} - clone_tensorrt_llm_backend_repo build_gpt2_base_model build_gpt2_tensorrt_engine diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh index 2e0c64e30c..98c5cb1d76 100755 --- a/qa/L0_perf_tensorrt_llm/test.sh +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -43,13 +43,7 @@ SERVER=${TRITON_DIR}/bin/tritonserver BACKEND_DIR=${TRITON_DIR}/backends SERVER_LOG="${NAME}_server.log" SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} - -function clone_tensorrt_llm_backend_repo { - rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR - apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR - cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive -} +source ../common/util.sh # Update Open MPI to a version compatible with SLURM. function upgrade_openmpi { @@ -95,69 +89,6 @@ function upgrade_openmpi { mpirun --version } -function build_gpt2_base_model { - # Download weights from HuggingFace Transformers - cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 - rm pytorch_model.bin model.safetensors - if ! 
wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then - echo "Downloading pytorch_model.bin failed." - exit 1 - fi - cd ${GPT_DIR} - - # Convert weights from HF Tranformers to FT format - python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" - cd ${BASE_DIR} -} - -function build_gpt2_tensorrt_engine { - # Build TensorRT engines - cd ${GPT_DIR} - trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --gpt_attention_plugin float16 \ - --remove_input_padding enable \ - --paged_kv_cache enable \ - --gemm_plugin float16 \ - --workers "${NUM_GPUS}" \ - --output_dir "${ENGINES_DIR}" - - cd ${BASE_DIR} -} - -function replace_config_tags { - tag_to_replace="${1}" - new_value="${2}" - config_file_path="${3}" - sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} -} - -function prepare_model_repository { - rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY} - cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY} - rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls - mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}" - - replace_config_tags "model_version: -1" "model_version: 1" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - replace_config_tags 'name: "ensemble"' "name: \"$MODEL_NAME\"" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" -} - # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on # success, 1 on failure function wait_for_server_ready() { diff --git a/qa/common/util.sh b/qa/common/util.sh index 3874916573..545b3ec736 100755 --- a/qa/common/util.sh +++ b/qa/common/util.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -534,3 +534,80 @@ function deactivate_virtualenv() { rm -fr venv fi } + +function replace_config_tags { + tag_to_replace="${1}" + new_value="${2}" + config_file_path="${3}" + sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} +} + +function prepare_model_repository { + rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY} + cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY} + rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls + mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}" + + replace_config_tags "model_version: -1" "model_version: 1" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + replace_config_tags 'name: "ensemble"' "name: \"$MODEL_NAME\"" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${logits_datatype}' "TYPE_FP32" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP32" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" +} + +function clone_tensorrt_llm_backend_repo { + rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR + apt-get update && apt-get install git-lfs -y --no-install-recommends + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule 
update --init --recursive
+}
+
+function build_gpt2_base_model {
+    # Download weights from HuggingFace Transformers
+    cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
+    rm pytorch_model.bin model.safetensors
+    if ! wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then
+        echo "Downloading pytorch_model.bin failed."
+        exit 1
+    fi
+    cd ${GPT_DIR}
+
+    # Convert weights from HF Transformers to FT format
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
+    cd ${BASE_DIR}
+}
+
+function build_gpt2_tensorrt_engine {
+    # Build TensorRT engines
+    cd ${GPT_DIR}
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
+    cd ${BASE_DIR}
+}
\ No newline at end of file
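
Usage note (illustrative, not part of the patch): with these helpers consolidated in qa/common/util.sh, a new QA test script only needs to export the variables the helpers read, source the file, and call the functions in order. A minimal sketch follows, assuming the script runs from a qa/L0_* directory; the default values chosen below for NUM_GPUS, TRITON_REPO_ORG, and TENSORRTLLM_BACKEND_REPO_TAG are hypothetical placeholders, not taken from this patch.

    #!/bin/bash
    # Environment the shared helpers expect; values here are placeholders.
    BASE_DIR=$(pwd)
    NUM_GPUS=${NUM_GPUS:=1}
    TRITON_REPO_ORG=${TRITON_REPO_ORG:="https://github.com/triton-inference-server"}
    TENSORRTLLM_BACKEND_REPO_TAG=${TENSORRTLLM_BACKEND_REPO_TAG:="main"}
    TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
    GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/models/core/gpt"
    TOKENIZER_DIR="$GPT_DIR/gpt2"
    ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
    MODEL_NAME="gpt2_tensorrt_llm"
    MODEL_REPOSITORY="$(pwd)/triton_model_repo"

    # Shared helpers now live in qa/common/util.sh.
    source ../common/util.sh

    clone_tensorrt_llm_backend_repo   # clone tensorrtllm_backend at ${TENSORRTLLM_BACKEND_REPO_TAG}
    build_gpt2_base_model             # download GPT-2 weights and convert the checkpoint
    build_gpt2_tensorrt_engine       # build TensorRT engines into ${ENGINES_DIR}
    prepare_model_repository          # fill the config.pbtxt templates in ${MODEL_REPOSITORY}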