88
99# Number of prefill and decode instances to create
1010NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:- 1} # Default to 1
11- NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:- 2} # Default to 2
11+ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:- 1} # Default to 1
12+ PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:- 1}
13+ DECODER_TP_SIZE=${DECODER_TP_SIZE:- 1}
1214
1315# Find the git repository root directory
1416GIT_ROOT=$( git rev-parse --show-toplevel)
@@ -44,40 +46,6 @@ get_model_args() {
4446 echo " $extra_args "
4547}
4648
47- set_cli_args () {
48- PREFILLER_TP_SIZE=1
49- DECODER_TP_SIZE=1
50- # Iterate through the rest of the arguments
51- while [[ $# -gt 0 ]]; do
52- echo $#
53- case " $1 " in
54- --prefiller-tp-size)
55- if [[ -n " $2 " ]]; then
56- PREFILLER_TP_SIZE=" $2 "
57- shift 2 # Consume the flag and its value ($2)
58- else
59- echo " Error: --prefiller-tp-size requires a value." >&2
60- exit 1
61- fi
62- ;;
63- --decoder-tp-size)
64- if [[ -n " $2 " ]]; then
65- DECODER_TP_SIZE=" $2 "
66- shift 2
67- else
68- echo " Error: --decoder-tp-size requires a value." >&2
69- exit 1
70- fi
71- ;;
72- * )
73- # Handle any arguments not recognized
74- shift # Ignore unknown argument
75- ;;
76- esac
77- done
78- }
79-
80-
8149# Function to run tests for a specific model
8250run_tests_for_model () {
8351 local model_name=$1
@@ -87,7 +55,6 @@ run_tests_for_model() {
8755
8856 # Get model-specific arguments
8957 local model_args=$( get_model_args " $model_name " )
90- set_cli_args " $@ "
9158
9259 # Arrays to store all hosts and ports
9360 PREFILL_HOSTS=()
@@ -100,15 +67,16 @@ run_tests_for_model() {
10067 # Calculate GPU ID - we'll distribute across available GPUs
10168 GPU_ID=$(( i % $(nvidia- smi -- query- gpu= name -- format= csv, noheader | wc - l)) )
10269
70+
10371 # Calculate port number (base port + instance number)
10472 PORT=$(( 8100 + i))
10573 # Calculate side channel port. Avoid clash with TP workers.
106- SIDE_CHANNEL_PORT=$(( 5559 + i * $PREFILLER_TP_SIZE ))
74+ SIDE_CHANNEL_PORT=$(( 5559 + i))
10775
10876 echo " Starting prefill instance $i on GPU $GPU_ID , port $PORT "
10977
11078 # Build the command with or without model-specific args
111- BASE_CMD=" VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
79+ BASE_CMD=" CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
11280 --port $PORT \
11381 --enforce-eager \
11482 --disable-log-requests \
@@ -137,12 +105,12 @@ run_tests_for_model() {
137105 # Calculate port number (base port + instance number)
138106 PORT=$(( 8200 + i))
139107 # Calculate side channel port
140- SIDE_CHANNEL_PORT=$(( 5659 + i * $PREFILLER_TP_SIZE ))
108+ SIDE_CHANNEL_PORT=$(( 5659 + i * $DECODER_TP_SIZE ))
141109
142110 echo " Starting decode instance $i on GPU $GPU_ID , port $PORT "
143111
144112 # Build the command with or without model-specific args
145- BASE_CMD=" VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
113+ BASE_CMD=" CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
146114 --port $PORT \
147115 --enforce-eager \
148116 --disable-log-requests \
@@ -203,7 +171,7 @@ run_tests_for_model() {
203171
204172# Run tests for each model
205173for model in " ${MODELS[@]} " ; do
206- run_tests_for_model " $model " " $@ "
174+ run_tests_for_model " $model "
207175done
208176
209177echo " All tests completed!"
0 commit comments