#!/bin/bash
# Disaggregated prefill/decode accuracy test harness for vLLM on TPU.
# Launches a baseline server, then a prefill+decode pair behind a toy proxy,
# and exact-matches the outputs. All settings are overridable via env vars.
set -xe

# Hosts / ports
PREFILL_HOST=${PREFILL_HOST:-"localhost"}
PREFILL_PORT=${PREFILL_PORT:-8100}
PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577}
DECODE_HOST=${DECODE_HOST:-"localhost"}
DECODE_PORT=${DECODE_PORT:-8200}
PROXY_HOST=${PROXY_HOST:-"localhost"}
PROXY_PORT=${PROXY_PORT:-8192}
BASELINE_HOST=${BASELINE_HOST:-"localhost"}
BASELINE_PORT=${BASELINE_PORT:-9290}


# Model to run.
MODEL_NAME=${MODEL_NAME:-"meta-llama/Llama-3.2-3B-Instruct"}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024}
BLOCK_SIZE=${BLOCK_SIZE:-32}


# execution env
# Fall back to the current directory when not inside a git checkout, so the
# script fails later with a clear missing-file error instead of dying here
# under `set -e`.
GIT_ROOT=$(git rev-parse --show-toplevel 2>/dev/null || pwd)
EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration"
CONDA_PATH=${CONDA_PATH:-"/home/${USER}/anaconda3"}
CONDA_ENV_NAME=${CONDA_ENV_NAME:-"nixl"}

OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.tpu_accuracy_test_outputs.txt"}

# Trap the SIGINT signal (triggered by Ctrl+C). Kill any background jobs on
# interrupt/termination/exit; `|| true` guards the no-running-jobs case, where
# `kill` would otherwise be invoked with no arguments and fail.
trap 'kill $(jobs -pr) 2>/dev/null || true' SIGINT SIGTERM EXIT
32+
33+
34+ # Waits for vLLM server to start.
# Waits (up to 1200s) for a vLLM server to answer on its completions endpoint.
# Arguments: $1 - host, $2 - port
# Returns:   0 once the server responds, non-zero if the timeout elapses.
wait_for_server() {
  local host=$1
  local port=$2
  # `timeout`'s exit status propagates as the function's return value.
  timeout 1200 bash -c "
    until curl -s ${host}:${port}/v1/completions > /dev/null; do
      sleep 1
    done"
}
43+
44+ # Cleanup function
# Cleanup function: kills any leftover python (vLLM server / proxy) processes
# on this host so the next phase starts from a clean slate.
cleanup() {
  echo "Caught Ctrl+C, cleaning up..."
  # Cleanup commands. `-r` skips running kill when pgrep matches nothing;
  # `|| true` keeps `set -e` from aborting when there is nothing to kill.
  pgrep python | xargs -r kill -9 || true
  # pkill -f python || true
  echo "Cleanup complete. Exiting."
}
52+
# Launches the non-disaggregated baseline vLLM server on BASELINE_HOST via ssh
# (in the background). Its outputs are the reference for the exact-match test.
launch_baseline() {
  BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
    VLLM_LOGGING_LEVEL=DEBUG \
    VLLM_USE_V1=1 \
    PJRT_DEVICE=TPU \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
      --host ${BASELINE_HOST} \
      --port ${BASELINE_PORT} \
      --max-model-len ${MAX_MODEL_LEN} \
      --seed 42 \
      --block-size ${BLOCK_SIZE} \
      --gpu-memory-utilization 0.5 \
      --enforce-eager"
  echo ${BASELINE_BASE_CMD}
  # -tt forces a tty so the remote server is torn down when the ssh session is.
  ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" &
}
70+
# Launches the disaggregated prefill and decode vLLM servers over ssh and
# blocks until both answer health probes. The NIXL side-channel host/port are
# exported on the prefill side so the decode worker can fetch KV blocks.
launch_pd() {
  PREFILL_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
    UCX_TLS=tcp \
    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
    VLLM_LOGGING_LEVEL=DEBUG \
    VLLM_USE_V1=1 \
    VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
    VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
    PJRT_DEVICE=TPU \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
      --host ${PREFILL_HOST} \
      --port ${PREFILL_PORT} \
      --max-model-len ${MAX_MODEL_LEN} \
      --seed 42 \
      --block-size ${BLOCK_SIZE} \
      --enforce-eager \
      --gpu-memory-utilization 0.5 \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"


  DECODE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
    UCX_TLS=tcp \
    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
    VLLM_LOGGING_LEVEL=DEBUG \
    VLLM_USE_V1=1 \
    PJRT_DEVICE=TPU \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
      --host ${DECODE_HOST} \
      --port ${DECODE_PORT} \
      --max-model-len ${MAX_MODEL_LEN} \
      --seed 42 \
      --block-size ${BLOCK_SIZE} \
      --enforce-eager \
      --gpu-memory-utilization 0.5 \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

  echo ${PREFILL_BASE_CMD}
  echo ${DECODE_BASE_CMD}
  sleep 2

  # execute on hosts
  ssh -tt ${PREFILL_HOST} "${PREFILL_BASE_CMD}" &
  ssh -tt ${DECODE_HOST} "${DECODE_BASE_CMD}" &
  sleep 1
  wait_for_server ${PREFILL_HOST} ${PREFILL_PORT}
  sleep 1
  wait_for_server ${DECODE_HOST} ${DECODE_PORT}
  sleep 1
}
122+
# Starts the toy proxy on PROXY_HOST via ssh (in the background). The proxy
# routes each request's prefill to the prefiller and decode to the decoder.
launch_pd_proxy() {
  PROXY_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
    python3 ${EXP_ROOT}/toy_proxy_server.py \
      --prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \
      --decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \
      --host=${PROXY_HOST} --port ${PROXY_PORT}"
  echo ${PROXY_BASE_CMD}
  ssh -tt ${PROXY_HOST} "${PROXY_BASE_CMD}" &
}
132+
# Runs the accuracy test client against a serving endpoint.
# Arguments: $1 - base service URL
#            $2 - mode label ("baseline" records outputs; "disagg" compares)
run_tests() {
  local service_url=$1
  local mode=$2
  python3 "${EXP_ROOT}/test_disagg_accuracy.py" \
    --service_url="${service_url}" \
    --model_name="${MODEL_NAME}" \
    --mode="${mode}" \
    --file_name="${OUTPUT_FILE}"
}
138+
139+
# run non-disagg. baseline & save outputs
launch_baseline
sleep 2
wait_for_server ${BASELINE_HOST} ${BASELINE_PORT}
run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline"
cleanup
sleep 10


# run disagg. & do exact-match with the outputs from baseline
launch_pd
launch_pd_proxy
sleep 10
run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg"
echo "-----P/D success----"

rm "${OUTPUT_FILE}"
cleanup

exit 0