
Commit 1380155

ci: fix the trtllm tests after the repo migration of trtllm backend (#8241)
1 parent af8bb93 commit 1380155
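
The TensorRT-LLM backend sources were migrated into the tensorrt_llm repository tree, so the QA test scripts below are updated to the new in-tree paths (tensorrt_llm/examples/models/core/... and tensorrt_llm/triton_backend/...), and the OpenAI frontend now passes the sampling seed to the backend under the renamed input tensor "seed" instead of "random_seed".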

File tree

3 files changed (+51 / -8 lines)

python/openai/openai_frontend/engine/utils/triton.py
qa/L0_orca/test.sh
qa/L0_perf_tensorrt_llm/test.sh

python/openai/openai_frontend/engine/utils/triton.py

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ def _create_trtllm_inference_request(
     if request.presence_penalty is not None:
         inputs["presence_penalty"] = np.float32([[request.presence_penalty]])
     if request.seed is not None:
-        inputs["random_seed"] = np.uint64([[request.seed]])
+        inputs["seed"] = np.uint64([[request.seed]])
     if request.temperature is not None:
         inputs["temperature"] = np.float32([[request.temperature]])
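
With this rename, the backend consumes the sampling seed under the input tensor name "seed". As a quick smoke test of the plumbing, a request like the following sketch should produce reproducible sampling; the port 9000 and the availability of "seed" in the completions schema are assumptions, and the model name is MODEL_NAME from the QA scripts below.

# Hypothetical smoke test for the renamed input; port 9000 and the
# completions-schema "seed" field are assumptions, not part of this commit.
curl -s http://localhost:9000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "gpt2_tensorrt_llm",
          "prompt": "Hello, my name is",
          "max_tokens": 16,
          "seed": 42
        }'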

qa/L0_orca/test.sh

Lines changed: 46 additions & 3 deletions
@@ -36,7 +36,7 @@ MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
 TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
-GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
+GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/models/core/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
 TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}

@@ -48,9 +48,52 @@ CLIENT_PY=${BASE_DIR}/orca_http_test.py
 CLIENT_LOG="${NAME}_orca_http_test.log"
 source ../common/util.sh

+function clone_tensorrt_llm_backend_repo {
+    rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
+    apt-get update && apt-get install git-lfs -y --no-install-recommends
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
+}
+
+function build_gpt2_base_model {
+    # Download weights from HuggingFace Transformers
+    cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
+    rm pytorch_model.bin model.safetensors
+    if ! wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then
+        echo "Downloading pytorch_model.bin failed."
+        exit 1
+    fi
+    cd ${GPT_DIR}
+
+    # Convert weights from HF Transformers to FT format
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
+    cd ${BASE_DIR}
+}
+
+function build_gpt2_tensorrt_engine {
+    # Build TensorRT engines
+    cd ${GPT_DIR}
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
+    cd ${BASE_DIR}
+}
+
+function replace_config_tags {
+    tag_to_replace="${1}"
+    new_value="${2}"
+    config_file_path="${3}"
+    sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path}
+}
+
 function prepare_model_repository {
     rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY}
-    cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY}
+    cp -r ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY}
     rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls
     mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}"

@@ -113,7 +156,7 @@ function wait_for_server_ready() {
 }

 function run_server {
-    python3 ${TENSORRTLLM_BACKEND_DIR}/scripts/launch_triton_server.py --world_size="${NUM_GPUS}" --model_repo="${MODEL_REPOSITORY}" >${SERVER_LOG} 2>&1 &
+    python3 ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size="${NUM_GPUS}" --model_repo="${MODEL_REPOSITORY}" >${SERVER_LOG} 2>&1 &
     sleep 2 # allow time to obtain the pid(s)
     # Read PIDs into an array, trimming whitespaces
     readarray -t SERVER_PID < <(pgrep "tritonserver")
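
The hunk above adds only the helper-function definitions; the call site is not shown in this diff. A minimal sketch of the intended call order, assuming TRITON_REPO_ORG and TENSORRTLLM_BACKEND_REPO_TAG are exported by the CI environment:

# Sketch of the driver flow (the actual invocation lives elsewhere in test.sh).
clone_tensorrt_llm_backend_repo    # fetch the backend sources at the pinned tag
build_gpt2_base_model              # download GPT-2 weights and convert the checkpoint
build_gpt2_tensorrt_engine         # compile TensorRT engines into ${ENGINES_DIR}
prepare_model_repository           # assemble ${MODEL_REPOSITORY} from the in-tree templates
run_server                         # launch tritonserver via launch_triton_server.py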

qa/L0_perf_tensorrt_llm/test.sh

Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions

@@ -35,7 +35,7 @@ MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
 TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
-GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
+GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/models/core/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
 TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}

@@ -133,7 +133,7 @@ function replace_config_tags {

 function prepare_model_repository {
     rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY}
-    cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY}
+    cp -r ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY}
     rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls
     mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}"

@@ -189,7 +189,7 @@ function wait_for_server_ready() {
 }

 function run_server {
-    python3 ${TENSORRTLLM_BACKEND_DIR}/scripts/launch_triton_server.py --world_size="${NUM_GPUS}" --model_repo="${MODEL_REPOSITORY}" >${SERVER_LOG} 2>&1 &
+    python3 ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size="${NUM_GPUS}" --model_repo="${MODEL_REPOSITORY}" >${SERVER_LOG} 2>&1 &
     sleep 2 # allow time to obtain the pid(s)
     # Read PIDs into an array, trimming whitespaces
     readarray -t SERVER_PID < <(pgrep "tritonserver")
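
Both test scripts now hard-code the migrated in-tree locations. A quick existence check, a sketch and not part of this commit, can catch a stale tensorrtllm_backend checkout before the long engine build starts:

# Sketch: verify the post-migration paths referenced above exist.
for p in \
    "${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/examples/models/core/gpt" \
    "${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/all_models/inflight_batcher_llm" \
    "${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/triton_backend/scripts/launch_triton_server.py"; do
    [ -e "${p}" ] || { echo "missing migrated path: ${p}"; exit 1; }
done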
