Commit 79b7a2f

modify hpu accuracy test

1 parent e2e76ab

1 file changed: 195 additions (+), 132 deletions (-)
@@ -1,159 +1,222 @@
 #!/bin/bash
 set -xe
 
-# Hosts / ports
-PREFILL_HOST=${PREFILL_HOST:-"localhost"}
-PREFILL_PORT=${PREFILL_PORT:-8100}
-PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577}
-DECODE_HOST=${DECODE_HOST:-"localhost"}
-DECODE_PORT=${DECODE_PORT:-8200}
-PROXY_HOST=${PROXY_HOST:-"localhost"}
-PROXY_PORT=${PROXY_PORT:-8192}
-BASELINE_HOST=${BASELINE_HOST:-"localhost"}
-BASELINE_PORT=${BASELINE_PORT:-9290}
+# Models to run
+MODELS=(
+  "Qwen/Qwen3-0.6B"
+)
+MODELS=(
+  "meta-llama/Llama-3.1-8B"
+)
 
+export VLLM_SKIP_WARMUP="true"
+#export PT_HPU_LAZY_MODE=1
 
-# Model to run.
-MODEL_NAME=${MODEL_NAME:-"meta-llama/Llama-3.2-3B-Instruct"}
-MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024}
-BLOCK_SIZE=${BLOCK_SIZE:-32}
+# Number of prefill and decode instances to create
+NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1}  # Default to 1
+NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}  # Default to 1
+PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
+DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 
+# Find the git repository root directory
+#GIT_ROOT=$(git rev-parse --show-toplevel)
+GIT_ROOT="/home/vllm-nixl/vllm"
 
-# execution env
-GIT_ROOT=$(git rev-parse --show-toplevel)
-EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration"
-CONDA_PATH=${CONDA_PATH:-"/home/${USER}/anaconda3"}
-CONDA_ENV_NAME=${CONDA_ENV_NAME:-"nixl"}
-
-OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.tpu_accuracy_test_outputs.txt"}
+#SMI_BIN=$(which nvidia-smi || which rocm-smi)
 
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
 
-
-# Waits for vLLM server to start.
+# Waits for vLLM to start.
 wait_for_server() {
-  local host=$1
-  local port=$2
+  local port=$1
   timeout 1200 bash -c "
-    until curl -s ${host}:${port}/v1/completions > /dev/null; do
-      sleep 1
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
    done" && return 0 || return 1
 }
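
Note on the readiness check: wait_for_server treats an instance as up once curl can complete a request against the OpenAI-compatible /v1/completions route, retrying once per second for up to 1200 seconds. A minimal standalone sketch of the same probe, assuming an instance already listening on port 8300 (the port value is illustrative):

    # one-off probe; curl exits 0 once the route is reachable
    if curl -s localhost:8300/v1/completions > /dev/null; then
      echo "vLLM is up on 8300"
    fi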
 
-# Cleanup function
-cleanup() {
-  echo "Caught Ctrl+C, cleaning up..."
-  # Cleanup commands
-  pgrep python | xargs kill -9 || true
-  # pkill -f python || true
-  echo "Cleanup complete. Exiting."
+# Function to clean up previous instances
+cleanup_instances() {
+  echo "Cleaning up any running vLLM instances..."
+  pkill -f "vllm serve" || true
+  sleep 2
 }
 
-launch_baseline() {
-  BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
-    VLLM_LOGGING_LEVEL=DEBUG \
-    VLLM_USE_V1=1 \
-    PJRT_DEVICE=TPU \
-    VLLM_WORKER_MULTIPROC_METHOD=spawn \
-    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
-      --host ${BASELINE_HOST} \
-      --port ${BASELINE_PORT} \
-      --max-model-len ${MAX_MODEL_LEN} \
-      --seed 42 \
-      --block-size ${BLOCK_SIZE} \
-      --gpu-memory-utilization 0.5 \
-      --enforce-eager"
-  echo ${BASELINE_BASE_CMD}
-  ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" &
-}
+# Helper to get model-specific arguments for deepseek
+get_model_args() {
+  local model_name=$1
+  local extra_args=""
 
-launch_pd() {
-  PREFILL_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
-    UCX_TLS=tcp \
-    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
-    VLLM_LOGGING_LEVEL=DEBUG \
-    VLLM_USE_V1=1 \
-    VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
-    VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
-    PJRT_DEVICE=TPU \
-    VLLM_WORKER_MULTIPROC_METHOD=spawn \
-    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
-      --host ${PREFILL_HOST} \
-      --port ${PREFILL_PORT} \
-      --max-model-len ${MAX_MODEL_LEN} \
-      --seed 42 \
-      --block-size ${BLOCK_SIZE} \
-      --enforce-eager \
-      --gpu-memory-utilization 0.5 \
-      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
-
-
-  DECODE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
-    UCX_TLS=tcp \
-    VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
-    VLLM_LOGGING_LEVEL=DEBUG \
-    VLLM_USE_V1=1 \
-    PJRT_DEVICE=TPU \
-    VLLM_WORKER_MULTIPROC_METHOD=spawn \
-    VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
-      --host ${DECODE_HOST} \
-      --port ${DECODE_PORT} \
-      --max-model-len ${MAX_MODEL_LEN} \
-      --seed 42 \
-      --block-size ${BLOCK_SIZE} \
-      --enforce-eager \
-      --gpu-memory-utilization 0.5 \
-      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
-
-  echo ${PREFILL_BASE_CMD}
-  echo ${DECODE_BASE_CMD}
-  sleep 2
+  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
+    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
+  fi
 
-  # execute on hosts
-  ssh -tt ${PREFILL_HOST} "${PREFILL_BASE_CMD}" &
-  ssh -tt ${DECODE_HOST} "${DECODE_BASE_CMD}" &
-  sleep 1
-  wait_for_server ${PREFILL_HOST} ${PREFILL_PORT}
-  sleep 1
-  wait_for_server ${DECODE_HOST} ${DECODE_PORT}
-  sleep 1
+  echo "$extra_args"
 }
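
For both entries in MODELS above, get_model_args returns an empty string; only the deepseek-vl2-tiny checkpoint needs extra flags. A usage sketch (the return value is captured the same way run_tests_for_model does below):

    # expands to the hf_overrides + trust-remote-code flags
    model_args=$(get_model_args "deepseek-ai/deepseek-vl2-tiny")
    # any other model yields "" and the serve command is used unchanged
    model_args=$(get_model_args "meta-llama/Llama-3.1-8B")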
 
-launch_pd_proxy(){
-  PROXY_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
-    python3 ${EXP_ROOT}/toy_proxy_server.py \
-      --prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \
-      --decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \
-      --host=${PROXY_HOST} --port ${PROXY_PORT}"
-  echo ${PROXY_BASE_CMD}
-  ssh -tt ${PROXY_HOST} "${PROXY_BASE_CMD}" &
+get_num_gpus() {
+  if [[ "$SMI_BIN" == *"nvidia"* ]]; then
+    echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
+  else
+    echo "$($SMI_BIN -l | grep GPU | wc -l)"
+  fi
 }
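
Since the SMI_BIN assignment near the top of the script is left commented out, get_num_gpus would fall through to the non-nvidia branch with an empty binary on this HPU setup; the hardcoded GPU_ID values below avoid calling it. A sketch of how it is presumably meant to be used when SMI_BIN is set:

    SMI_BIN=$(which nvidia-smi || which rocm-smi)  # mirrors the commented line above
    NUM_GPUS=$(get_num_gpus)                       # e.g. 8 on an eight-card host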
 
-run_tests(){
-  local service_url=$1
-  local mode=$2
-  python3 ${EXP_ROOT}/test_disagg_accuracy.py --service_url=${service_url} --model_name=${MODEL_NAME} --mode=${mode} --file_name=${OUTPUT_FILE}
+# Function to run tests for a specific model
+run_tests_for_model() {
+  local model_name=$1
+  echo "================================"
+  echo "Testing model: $model_name"
+  echo "================================"
+
+  # Get model-specific arguments
+  local model_args=$(get_model_args "$model_name")
+
+  # Arrays to store all hosts and ports
+  PREFILL_HOSTS=()
+  PREFILL_PORTS=()
+  DECODE_HOSTS=()
+  DECODE_PORTS=()
+
+  # Start prefill instances
+  for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
+    # Calculate GPU ID - we'll distribute across available GPUs
+    #GPU_ID=$((i % $(get_num_gpus)))
+    GPU_ID=2
+
+    # Calculate port number (base port + instance number)
+    PORT=$((8300 + i))
+    # Calculate side channel port. Avoid clash with TP workers.
+    SIDE_CHANNEL_PORT=$((6559 + i))
+
+    echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
+
+    # Build the command with or without model-specific args
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+      --port $PORT \
+      --enforce-eager \
+      --disable-log-requests \
+      --gpu-memory-utilization 0.3 \
+      --tensor-parallel-size $PREFILLER_TP_SIZE \
+      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
+
+    if [ -n "$model_args" ]; then
+      FULL_CMD="$BASE_CMD $model_args"
+    else
+      FULL_CMD="$BASE_CMD"
+    fi
+
+    eval "$FULL_CMD &"
+
+    # Store host and port for proxy configuration
+    PREFILL_HOSTS+=("localhost")
+    PREFILL_PORTS+=($PORT)
+  done
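
Ports are a pure function of the instance index, so concurrently launched prefill instances never collide: instance i serves HTTP on 8300+i and opens its NIXL side channel on 6559+i. A worked example with NUM_PREFILL_INSTANCES=2:

    # i=0 -> PORT=8300, SIDE_CHANNEL_PORT=6559
    # i=1 -> PORT=8301, SIDE_CHANNEL_PORT=6560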
+
+  # Start decode instances
+  for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
+    # Calculate GPU ID - we'll distribute across available GPUs, starting after the prefill GPUs
+    #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
+    GPU_ID=6
+    # Calculate port number (base port + instance number)
+    PORT=$((8400 + i))
+    # Calculate side channel port
+    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE))
+
+    echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
+
+    # Build the command with or without model-specific args
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+      --port $PORT \
+      --enforce-eager \
+      --disable-log-requests \
+      --gpu-memory-utilization 0.3 \
+      --tensor-parallel-size $DECODER_TP_SIZE \
+      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
+
+    if [ -n "$model_args" ]; then
+      FULL_CMD="$BASE_CMD $model_args"
+    else
+      FULL_CMD="$BASE_CMD"
+    fi
+
+    eval "$FULL_CMD &"
+
+    # Store host and port for proxy configuration
+    DECODE_HOSTS+=("localhost")
+    DECODE_PORTS+=($PORT)
+  done
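
The decode side-channel base advances by DECODER_TP_SIZE per instance, presumably because each tensor-parallel worker takes its own consecutive port above the base (the prefill comment about avoiding clashes with TP workers points the same way). A worked example with DECODER_TP_SIZE=2:

    # i=0 -> PORT=8400, SIDE_CHANNEL_PORT=5659 (assumed worker ports 5659, 5660)
    # i=1 -> PORT=8401, SIDE_CHANNEL_PORT=5661 (assumed worker ports 5661, 5662)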
+
+  # Wait for all instances to start
+  for PORT in "${PREFILL_PORTS[@]}"; do
+    echo "Waiting for prefill instance on port $PORT to start..."
+    wait_for_server $PORT
+  done
+
+  for PORT in "${DECODE_PORTS[@]}"; do
+    echo "Waiting for decode instance on port $PORT to start..."
+    wait_for_server $PORT
+  done
+
+  # Build the command for the proxy server with all the hosts and ports
+  PROXY_CMD="python toy_proxy_server.py --port 9192"
+
+  # Add all prefill hosts and ports
+  PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
+  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}"
+
+  # Add all decode hosts and ports
+  PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}"
+  PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}"
+
+  # Start the proxy server
+  echo "Starting proxy server with command: $PROXY_CMD"
+  $PROXY_CMD &
+
+  # Wait for the proxy to start
+  sleep 10
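
The proxy is the single endpoint the accuracy checks talk to: it pairs each request with one prefiller and one decoder from the parallel host/port lists. With one instance of each, the assembled command expands to roughly the following (values illustrative, matching the defaults above):

    python toy_proxy_server.py --port 9192 \
      --prefiller-hosts localhost --prefiller-ports 8300 \
      --decoder-hosts localhost --decoder-ports 8400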
+
+  # curl -X POST -s http://localhost:9192/v1/completions \
+  #   -H "Content-Type: application/json" \
+  #   -d '{
+  #     "model": "meta-llama/Llama-3.1-8B",
+  #     "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]",
+  #     "max_tokens": 5,
+  #     "temperature": 0
+  #   }'
+  sleep 5
+  echo "--------------------===================-------------"
+  curl -X POST -s http://localhost:9192/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "meta-llama/Llama-3.1-8B",
+      "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.",
+      "max_tokens": 5,
+      "temperature": 0
+    }'
+  curl -X POST -s http://localhost:9192/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "meta-llama/Llama-3.1-8B",
+      "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.",
+      "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"],
+      "max_tokens": 2,
+      "temperature": 0
+    }'
+  #sleep 10000
+  # Run lm eval for this model
+  echo "Running tests for $model_name"
+  TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py
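
The two live curl requests above use temperature 0, so the completions through the disaggregated path are deterministic and easy to compare across runs; the actual pass/fail signal comes from the pytest step, which reads the model under test from the TEST_MODEL environment variable. It can be run by hand the same way, assuming the working directory contains test_accuracy.py:

    TEST_MODEL="meta-llama/Llama-3.1-8B" python -m pytest -s -x test_accuracy.py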
+
+  # Clean up before running the next model
+  cleanup_instances
+  sleep 3
 }
 
+# Run tests for each model
+for model in "${MODELS[@]}"; do
+  run_tests_for_model "$model"
+done
 
-# run non-disagg. baseline & save outputs
-launch_baseline
-sleep 2
-wait_for_server ${BASELINE_HOST} ${BASELINE_PORT}
-run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline"
-cleanup
-sleep 10
-
-
-# run disagg. & do exact-match with the outputs from baseline
-launch_pd
-launch_pd_proxy
-sleep 10
-run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg"
-echo "-----P/D success----"
-
-rm ${OUTPUT_FILE}
-cleanup
-
-exit 0
+echo "All tests completed!"
