@@ -4,40 +4,19 @@ set -euo pipefail
44# Parse named arguments
55while [[ $# -gt 0 ]]; do
66 case $1 in
7- # Worker configuration
8- --num-ctx-servers) num_ctx_servers=" $2 " ; shift 2 ;;
9- --num-gen-servers) num_gen_servers=" $2 " ; shift 2 ;;
10- --concurrency-list) concurrency_list=" $2 " ; shift 2 ;;
11-
12- # Sequence and benchmark parameters
13- --isl) isl=" $2 " ; shift 2 ;;
14- --osl) osl=" $2 " ; shift 2 ;;
15- --multi-round) multi_round=" $2 " ; shift 2 ;;
16- --benchmark-ratio) benchmark_ratio=" $2 " ; shift 2 ;;
17- --streaming) streaming=" $2 " ; shift 2 ;;
18- --use-nv-sa-benchmark) use_nv_sa_benchmark=" $2 " ; shift 2 ;;
7+ # Benchmark Configuration
198 --benchmark-mode) benchmark_mode=" $2 " ; shift 2 ;;
209
2110 # Environment and paths
22- --dataset-file) dataset_file=" $2 " ; shift 2 ;;
23- --model-path) model_path=" $2 " ; shift 2 ;;
2411 --trtllm-repo) trtllm_repo=" $2 " ; shift 2 ;;
2512 --work-dir) work_dir=" $2 " ; shift 2 ;;
2613 --full-logdir) full_logdir=" $2 " ; shift 2 ;;
14+ --container-name) container_name=" $2 " ; shift 2 ;;
2715 --container-mount) container_mount=" $2 " ; shift 2 ;;
2816 --container-image) container_image=" $2 " ; shift 2 ;;
2917 --build-wheel) build_wheel=" $2 " ; shift 2 ;;
3018 --cuda-architectures) cuda_architectures=" $2 " ; shift 2 ;;
3119 --trtllm-wheel-path) trtllm_wheel_path=" $2 " ; shift 2 ;;
32-
33- # Accuracy evaluation
34- --enable-accuracy-test) enable_accuracy_test=" $2 " ; shift 2 ;;
35- --accuracy-model) accuracy_model=" $2 " ; shift 2 ;;
36- --accuracy-tasks) accuracy_tasks=" $2 " ; shift 2 ;;
37- --model-args-extra) model_args_extra=" $2 " ; shift 2 ;;
38-
39- # Server environment variables
40- --server-env-var) server_env_var=" $2 " ; shift 2 ;;
4120 * )
4221 echo " Unknown argument: $1 "
4322 exit 1
# Print all parsed arguments so the job log records the configuration the
# script is running with.
# NOTE: the script runs under `set -u`, so a flag that was never passed makes
# the corresponding line abort with an "unbound variable" error.
echo "Parsed arguments:"
echo
echo "Benchmark Configuration:"
echo "  benchmark_mode: ${benchmark_mode}"
echo
echo "Environment Configuration:"
echo "  trtllm_repo: ${trtllm_repo}"
echo "  work_dir: ${work_dir}"
echo "  full_logdir: ${full_logdir}"
# container_name is a parsed argument (--container-name), so report it
# alongside the other container settings.
echo "  container_name: ${container_name}"
echo "  container_mount: ${container_mount}"
echo "  container_image: ${container_image}"
echo "  build_wheel: ${build_wheel}"
echo "  cuda_architectures: ${cuda_architectures}"
echo "  trtllm_wheel_path: ${trtllm_wheel_path}"
75- echo
76- echo " Accuracy Configuration:"
77- echo " enable_accuracy_test: ${enable_accuracy_test} "
78- echo " accuracy_model: ${accuracy_model} "
79- echo " accuracy_tasks: ${accuracy_tasks} "
80- echo " model_args_extra: ${model_args_extra} "
81- echo
82- echo " Server Environment Variables:"
83- echo " server_env_var: ${server_env_var} "
8442
# gen_only_no_context mode: export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 so that
# it is visible to this script and to every process it launches.
case "${benchmark_mode}" in
    gen_only_no_context)
        export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
        echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
        ;;
esac
9248
93- container_name=" disaggr-test"
94-
#######################################
# Report a fatal error, cancel the SLURM job and exit with status 1.
# Globals:   SLURM_JOB_ID (read)
# Arguments: $1 - error message to print
# Outputs:   "Error: <message>" on stdout; a warning on stderr if scancel fails
# Returns:   never returns; always exits the shell with status 1
#######################################
cleanup_on_failure() {
    echo "Error: ${1:-unspecified failure}"
    # Guard the job id so `set -u` cannot abort before the explicit exit, and
    # absorb a scancel failure so `set -e` cannot replace the exit status.
    if [[ -n "${SLURM_JOB_ID:-}" ]]; then
        scancel "${SLURM_JOB_ID}" \
            || echo "Warning: scancel ${SLURM_JOB_ID} failed" >&2
    fi
    exit 1
}
10155
102- echo " SLURM_JOB_ID: ${SLURM_JOB_ID} " > ${full_logdir} /job_info.txt
# Snapshot the job's environment for later debugging. The target is quoted so
# a log directory containing spaces or glob characters is handled correctly.
# NOTE(review): this writes every environment variable, including any
# credentials present in the environment, into the log directory.
env > "${full_logdir}/environment.txt"
10457
10558# Start container
@@ -155,6 +108,11 @@ elif [ -d "${trtllm_repo}" ]; then
155108 cleanup_on_failure " TensorRT-LLM installation failed. Check ${full_logdir} /2_install.log for details"
156109 fi
157110 echo " TensorRT-LLM installation completed successfully"
111+ else
112+ echo " trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
113+ if [ -v TRT_LLM_GIT_COMMIT ]; then
114+ echo " TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT} "
115+ fi
158116fi
159117
160118# Get node lists and replace the placeholder with the actual node names
@@ -163,39 +121,30 @@ all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
# Join the node names into one comma-separated string and log it.
all_nodes_str=$(
    IFS=','
    echo "${all_nodes[*]}"
)
echo "all_nodes_str: ${all_nodes_str}"

# Substitute each <nodeN_placeholder> token in the server command file with
# the N-th node name, editing the file in place.
start_server_cmds_file="${full_logdir}/start_server_cmds.sh"
IFS=',' read -r -a node_array <<< "${all_nodes_str}"
for ((node_idx = 0; node_idx < ${#node_array[@]}; node_idx++)); do
    node_name="${node_array[node_idx]}"
    token="<node${node_idx}_placeholder>"

    # '|' is the sed delimiter, so node names may contain '/'.
    sed -i "s|${token}|${node_name}|g" "${start_server_cmds_file}"
    echo "Replaced ${token} with ${node_name}"
done
176134
# Start the servers: execute each line of ${start_server_cmds_file} as a shell
# command (ctx worker commands are skipped when
# TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set to 1).
#
# The file is read through fd 3 with `read -r`, matching the client command
# loop: backslashes in a command survive intact, a final line without a
# trailing newline is still executed, and the loop runs in the main shell
# instead of a pipeline subshell.
echo "Starting worker commands from ${start_server_cmds_file}..."
while read -r cmd <&3 || [[ -n "${cmd}" ]]; do
    # Skip ctx worker commands if in gen-only mode
    # CTX appears as argument to start_worker.sh and in log filename
    if [[ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" == "1" \
        && "${cmd}" == *"start_worker.sh CTX"* ]]; then
        echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
        continue
    fi
    echo "Executing command: ${cmd}"
    # NOTE: eval runs the line verbatim; the command file is generated for this
    # job and must not contain untrusted content.
    eval "${cmd}"
done 3< "${start_server_cmds_file}"
189147
190- # start the server (in background)
191- echo " Starting server..."
192- srun -l --container-name=${container_name} \
193- --container-image=${container_image} \
194- --container-mounts=${container_mount} \
195- --mpi=pmix --overlap -N 1 -n 1 \
196- bash ${work_dir} /start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} " ${server_env_var} " \
197- & > ${full_logdir} /4_output_server.log &
198-
199148# Wait for server to be ready (runs synchronously)
200149echo " Waiting for server to be ready..."
201150if ! srun -l --container-name=${container_name} \
@@ -207,47 +156,18 @@ if ! srun -l --container-name=${container_name} \
207156fi
208157echo " Server is ready!"
209158
210- # Start benchmarking
211- echo " Starting benchmark..."
212- if [ " ${use_nv_sa_benchmark} " = " true" ]; then
213- echo " Using NVIDIA SA benchmark script..."
214- if ! srun -l --container-name=${container_name} \
215- --container-mounts=${container_mount} \
216- --mpi=pmix --overlap -N 1 -n 1 \
217- bash ${work_dir} /run_benchmark_nv_sa.sh \
218- " ${model_path} " " ${isl} " " ${osl} " " ${benchmark_ratio} " " ${multi_round} " " ${num_gen_servers} " " ${concurrency_list} " " ${streaming} " " ${full_logdir} /" \
219- & > ${full_logdir} /6_bench.log; then
220- cleanup_on_failure " NVIDIA SA benchmark failed. Check ${full_logdir} /6_bench.log for details"
221- fi
222- else
223- echo " Using default benchmark script..."
224- if ! srun -l --container-name=${container_name} \
225- --container-mounts=${container_mount} \
226- --mpi=pmix --overlap -N 1 -n 1 \
227- bash ${work_dir} /run_benchmark.sh \
228- " ${model_path} " " ${dataset_file} " " ${multi_round} " " ${num_gen_servers} " " ${concurrency_list} " " ${streaming} " " ${full_logdir} /" \
229- & > ${full_logdir} /6_bench.log; then
230- cleanup_on_failure " Benchmark failed. Check ${full_logdir} /6_bench.log for details"
231- fi
232- fi
233- echo " Benchmark completed successfully"
234-
235- # Run accuracy evaluation if enabled
236- if [ " ${enable_accuracy_test} " = " true" ]; then
237- echo " Starting accuracy evaluation..."
238- if ! srun -l --container-name=${container_name} \
239- --container-mounts=${container_mount} \
240- --mpi=pmix --overlap -N 1 -n 1 \
241- bash ${work_dir} /accuracy_eval.sh \
242- " ${full_logdir} " " ${accuracy_model} " " ${accuracy_tasks} " " ${model_path} " \
243- " ${model_args_extra} " " ${full_logdir} /accuracy_eval" \
244- & > ${full_logdir} /7_accuracy_eval.log; then
245- cleanup_on_failure " Accuracy evaluation failed. Check ${full_logdir} /7_accuracy_eval.log for details"
# Start client commands: execute each line of ${client_cmds_file} in order and
# abort the whole job through cleanup_on_failure on the first failing command.
client_cmds_file="${full_logdir}/client_cmds.sh"
echo "Starting client commands from ${client_cmds_file}..."
# The command list is read through fd 3 so it does not occupy stdin while the
# commands run; `|| [[ -n ... ]]` keeps a final line that lacks a newline.
while read -r cmd <&3 || [[ -n "${cmd}" ]]; do
    echo "Starting client command: ${cmd}"
    # Test eval directly: under `set -e` a failing eval would terminate the
    # script before a separate `$?` check could call cleanup_on_failure.
    if ! eval "${cmd}"; then
        cleanup_on_failure "Command failed: ${cmd}."
    fi
done 3< "${client_cmds_file}"
249169
250- echo " Total runtime: $SECONDS seconds"
# Report the elapsed wall-clock time (bash's SECONDS counter) for the run.
echo "Job completed successfully, total runtime: ${SECONDS} seconds"

# try to kill the server and workers
# (job id quoted so it is always passed to scancel as a single argument)
scancel "${SLURM_JOB_ID}"
0 commit comments