Skip to content

Commit 02fd134

Browse files
authored
[None] [feat] Enhancements to slurm scripts (#10031)
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
1 parent 6649c37 commit 02fd134

File tree

4 files changed: +116 / -181 lines changed

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 24 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -4,40 +4,19 @@ set -euo pipefail
44
# Parse named arguments
55
while [[ $# -gt 0 ]]; do
66
case $1 in
7-
# Worker configuration
8-
--num-ctx-servers) num_ctx_servers="$2"; shift 2 ;;
9-
--num-gen-servers) num_gen_servers="$2"; shift 2 ;;
10-
--concurrency-list) concurrency_list="$2"; shift 2 ;;
11-
12-
# Sequence and benchmark parameters
13-
--isl) isl="$2"; shift 2 ;;
14-
--osl) osl="$2"; shift 2 ;;
15-
--multi-round) multi_round="$2"; shift 2 ;;
16-
--benchmark-ratio) benchmark_ratio="$2"; shift 2 ;;
17-
--streaming) streaming="$2"; shift 2 ;;
18-
--use-nv-sa-benchmark) use_nv_sa_benchmark="$2"; shift 2 ;;
7+
# Benchmark Configuration
198
--benchmark-mode) benchmark_mode="$2"; shift 2 ;;
209

2110
# Environment and paths
22-
--dataset-file) dataset_file="$2"; shift 2 ;;
23-
--model-path) model_path="$2"; shift 2 ;;
2411
--trtllm-repo) trtllm_repo="$2"; shift 2 ;;
2512
--work-dir) work_dir="$2"; shift 2 ;;
2613
--full-logdir) full_logdir="$2"; shift 2 ;;
14+
--container-name) container_name="$2"; shift 2 ;;
2715
--container-mount) container_mount="$2"; shift 2 ;;
2816
--container-image) container_image="$2"; shift 2 ;;
2917
--build-wheel) build_wheel="$2"; shift 2 ;;
3018
--cuda-architectures) cuda_architectures="$2"; shift 2 ;;
3119
--trtllm-wheel-path) trtllm_wheel_path="$2"; shift 2 ;;
32-
33-
# Accuracy evaluation
34-
--enable-accuracy-test) enable_accuracy_test="$2"; shift 2 ;;
35-
--accuracy-model) accuracy_model="$2"; shift 2 ;;
36-
--accuracy-tasks) accuracy_tasks="$2"; shift 2 ;;
37-
--model-args-extra) model_args_extra="$2"; shift 2 ;;
38-
39-
# Server environment variables
40-
--server-env-var) server_env_var="$2"; shift 2 ;;
4120
*)
4221
echo "Unknown argument: $1"
4322
exit 1
@@ -48,58 +27,32 @@ done
4827
# Print all parsed arguments
4928
echo "Parsed arguments:"
5029
echo
51-
echo "Worker Configuration:"
52-
echo " num_ctx_servers: ${num_ctx_servers}"
53-
echo " num_gen_servers: ${num_gen_servers}"
54-
echo " concurrency_list: ${concurrency_list}"
55-
echo
5630
echo "Benchmark Configuration:"
57-
echo " isl: ${isl}"
58-
echo " osl: ${osl}"
59-
echo " multi_round: ${multi_round}"
60-
echo " benchmark_ratio: ${benchmark_ratio}"
61-
echo " streaming: ${streaming}"
62-
echo " use_nv_sa_benchmark: ${use_nv_sa_benchmark}"
6331
echo " benchmark_mode: ${benchmark_mode}"
6432
echo
6533
echo "Environment Configuration:"
66-
echo " dataset_file: ${dataset_file}"
67-
echo " model_path: ${model_path}"
6834
echo " trtllm_repo: ${trtllm_repo}"
6935
echo " work_dir: ${work_dir}"
7036
echo " full_logdir: ${full_logdir}"
7137
echo " container_mount: ${container_mount}"
7238
echo " container_image: ${container_image}"
7339
echo " build_wheel: ${build_wheel}"
40+
echo " cuda_architectures: ${cuda_architectures}"
7441
echo " trtllm_wheel_path: ${trtllm_wheel_path}"
75-
echo
76-
echo "Accuracy Configuration:"
77-
echo " enable_accuracy_test: ${enable_accuracy_test}"
78-
echo " accuracy_model: ${accuracy_model}"
79-
echo " accuracy_tasks: ${accuracy_tasks}"
80-
echo " model_args_extra: ${model_args_extra}"
81-
echo
82-
echo "Server Environment Variables:"
83-
echo " server_env_var: ${server_env_var}"
8442

8543
# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode
8644
if [ "${benchmark_mode}" = "gen_only_no_context" ]; then
8745
export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
88-
worker_env_var="${worker_env_var} TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1"
89-
server_env_var="${server_env_var} TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1"
9046
echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
9147
fi
9248

93-
container_name="disaggr-test"
94-
9549
# Function to cleanup on failure
9650
cleanup_on_failure() {
9751
echo "Error: $1"
9852
scancel ${SLURM_JOB_ID}
9953
exit 1
10054
}
10155

102-
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}" > ${full_logdir}/job_info.txt
10356
env > ${full_logdir}/environment.txt
10457

10558
# Start container
@@ -155,6 +108,11 @@ elif [ -d "${trtllm_repo}" ]; then
155108
cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
156109
fi
157110
echo "TensorRT-LLM installation completed successfully"
111+
else
112+
echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
113+
if [ -v TRT_LLM_GIT_COMMIT ]; then
114+
echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
115+
fi
158116
fi
159117

160118
# Get node lists and replace the placeholder with the actual node names
@@ -163,39 +121,30 @@ all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
163121
all_nodes_str=$(IFS=','; echo "${all_nodes[*]}")
164122
echo "all_nodes_str: ${all_nodes_str}"
165123

166-
start_worker_cmds_file=${full_logdir}/start_worker_cmds.txt
124+
start_server_cmds_file=${full_logdir}/start_server_cmds.sh
167125
IFS=',' read -r -a node_array <<< "$all_nodes_str"
168126
for i in "${!node_array[@]}"; do
169127
current_val="${node_array[$i]}"
170128
placeholder="<node${i}_placeholder>"
171129

172130
# Use sed to replace the placeholder with the value in-place
173-
sed -i "s|$placeholder|$current_val|g" "${start_worker_cmds_file}"
131+
sed -i "s|$placeholder|$current_val|g" "${start_server_cmds_file}"
174132
echo "Replaced $placeholder with $current_val"
175133
done
176134

177-
# start the workers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
178-
echo "Starting worker commands from ${start_worker_cmds_file}..."
179-
cat ${start_worker_cmds_file} | while read cmd; do
135+
# start the servers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
136+
echo "Starting worker commands from ${start_server_cmds_file}..."
137+
cat ${start_server_cmds_file} | while read cmd; do
180138
# Skip ctx worker commands if in gen-only mode
181139
# CTX appears as argument to start_worker.sh and in log filename
182140
if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then
183141
echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
184142
continue
185143
fi
186-
echo "Starting worker command: ${cmd}"
144+
echo "Executing command: ${cmd}"
187145
eval "${cmd}"
188146
done
189147

190-
# start the server (in background)
191-
echo "Starting server..."
192-
srun -l --container-name=${container_name} \
193-
--container-image=${container_image} \
194-
--container-mounts=${container_mount} \
195-
--mpi=pmix --overlap -N 1 -n 1 \
196-
bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} "${server_env_var}" \
197-
&> ${full_logdir}/4_output_server.log &
198-
199148
# Wait for server to be ready (runs synchronously)
200149
echo "Waiting for server to be ready..."
201150
if ! srun -l --container-name=${container_name} \
@@ -207,47 +156,18 @@ if ! srun -l --container-name=${container_name} \
207156
fi
208157
echo "Server is ready!"
209158

210-
# Start benchmarking
211-
echo "Starting benchmark..."
212-
if [ "${use_nv_sa_benchmark}" = "true" ]; then
213-
echo "Using NVIDIA SA benchmark script..."
214-
if ! srun -l --container-name=${container_name} \
215-
--container-mounts=${container_mount} \
216-
--mpi=pmix --overlap -N 1 -n 1 \
217-
bash ${work_dir}/run_benchmark_nv_sa.sh \
218-
"${model_path}" "${isl}" "${osl}" "${benchmark_ratio}" "${multi_round}" "${num_gen_servers}" "${concurrency_list}" "${streaming}" "${full_logdir}/" \
219-
&> ${full_logdir}/6_bench.log; then
220-
cleanup_on_failure "NVIDIA SA benchmark failed. Check ${full_logdir}/6_bench.log for details"
221-
fi
222-
else
223-
echo "Using default benchmark script..."
224-
if ! srun -l --container-name=${container_name} \
225-
--container-mounts=${container_mount} \
226-
--mpi=pmix --overlap -N 1 -n 1 \
227-
bash ${work_dir}/run_benchmark.sh \
228-
"${model_path}" "${dataset_file}" "${multi_round}" "${num_gen_servers}" "${concurrency_list}" "${streaming}" "${full_logdir}/" \
229-
&> ${full_logdir}/6_bench.log; then
230-
cleanup_on_failure "Benchmark failed. Check ${full_logdir}/6_bench.log for details"
231-
fi
232-
fi
233-
echo "Benchmark completed successfully"
234-
235-
# Run accuracy evaluation if enabled
236-
if [ "${enable_accuracy_test}" = "true" ]; then
237-
echo "Starting accuracy evaluation..."
238-
if ! srun -l --container-name=${container_name} \
239-
--container-mounts=${container_mount} \
240-
--mpi=pmix --overlap -N 1 -n 1 \
241-
bash ${work_dir}/accuracy_eval.sh \
242-
"${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" \
243-
"${model_args_extra}" "${full_logdir}/accuracy_eval" \
244-
&> ${full_logdir}/7_accuracy_eval.log; then
245-
cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/7_accuracy_eval.log for details"
159+
# Start client commands
160+
client_cmds_file=${full_logdir}/client_cmds.sh
161+
echo "Starting client commands from ${client_cmds_file}..."
162+
while read -r cmd <&3; do
163+
echo "Starting client command: ${cmd}"
164+
eval "${cmd}"
165+
if [ $? -ne 0 ]; then
166+
cleanup_on_failure "Command failed: ${cmd}."
246167
fi
247-
echo "Accuracy evaluation completed successfully"
248-
fi
168+
done 3< "${client_cmds_file}"
249169

250-
echo "Total runtime: $SECONDS seconds"
170+
echo "Job completed successfully, total runtime: $SECONDS seconds"
251171

252172
# try to kill the server and workers
253173
scancel ${SLURM_JOB_ID}

examples/disaggregated/slurm/benchmark/run_benchmark.sh

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -62,8 +62,3 @@ for concurrency in ${concurrency_list}; do
6262
$(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)
6363
echo "Benchmark with concurrency ${concurrency} done"
6464
done
65-
66-
job_id=${SLURM_JOB_ID}
67-
if [ -n "${job_id}" ]; then
68-
echo "${SLURM_JOB_NODELIST}" > ${log_path}/job_${job_id}.txt
69-
fi

examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -89,8 +89,3 @@ for concurrency in ${concurrency_list}; do
8989

9090
echo "Benchmark with concurrency ${concurrency} done"
9191
done
92-
93-
# Save job information
94-
if [ -n "${SLURM_JOB_ID:-}" ]; then
95-
echo "${SLURM_JOB_NODELIST}" > "${log_path}/job_${SLURM_JOB_ID}.txt"
96-
fi

0 commit comments

Comments
 (0)