76 changes: 66 additions & 10 deletions tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -44,6 +44,39 @@ get_model_args() {
echo "$extra_args"
}

+set_cli_args() {
+    PREFILLER_TP_SIZE=1
+    DECODER_TP_SIZE=1
+    # Iterate through the rest of the arguments
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --prefiller-tp-size)
+                if [[ -n "$2" ]]; then
+                    PREFILLER_TP_SIZE="$2"
+                    shift 2 # Consume the flag and its value
+                else
+                    echo "Error: --prefiller-tp-size requires a value." >&2
+                    exit 1
+                fi
+                ;;
+            --decoder-tp-size)
+                if [[ -n "$2" ]]; then
+                    DECODER_TP_SIZE="$2"
+                    shift 2
+                else
+                    echo "Error: --decoder-tp-size requires a value." >&2
+                    exit 1
+                fi
+                ;;
+            *)
+                # Ignore any unrecognized arguments
+                shift
+                ;;
+        esac
+    done
+}
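For illustration, a minimal usage sketch of the new flags (the TP values here are hypothetical, not taken from this diff):

    # Run with 2-way TP prefill instances and 4-way TP decode instances
    ./run_accuracy_test.sh --prefiller-tp-size 2 --decoder-tp-size 4
    # Both flags default to 1; unrecognized arguments are shifted past and ignored.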


# Function to run tests for a specific model
run_tests_for_model() {
@@ -54,6 +87,7 @@ run_tests_for_model() {

# Get model-specific arguments
local model_args=$(get_model_args "$model_name")
+    set_cli_args "$@"

# Arrays to store all hosts and ports
PREFILL_HOSTS=()
@@ -64,20 +98,31 @@
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
-    GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # For tensor parallelism, we need to assign multiple consecutive GPUs
+    BASE_GPU_ID=$(((i * $PREFILLER_TP_SIZE) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Create a comma-separated list of GPU IDs for tensor parallelism
+    GPU_IDS=""
+    for ((j=0; j<$PREFILLER_TP_SIZE; j++)); do
+      if [ $j -gt 0 ]; then
+        GPU_IDS+=","
+      fi
+      GPU_IDS+="$((BASE_GPU_ID + j))"
+    done

# Calculate port number (base port + instance number)
PORT=$((8100 + i))
-    # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5559 + i))
+    # Calculate side channel port. Avoid clash with TP workers.
+    SIDE_CHANNEL_PORT=$((5559 + i * $PREFILLER_TP_SIZE))
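A worked example of the assignment arithmetic (node size and TP value assumed, not taken from this diff): with 8 visible GPUs and PREFILLER_TP_SIZE=2, prefill instance i=1 gets

    # BASE_GPU_ID       = (1 * 2) % 8  = 2   -> GPU_IDS="2,3"
    # PORT              = 8100 + 1     = 8101
    # SIDE_CHANNEL_PORT = 5559 + 1 * 2 = 5561

so consecutive instances advance the side-channel base by one port per TP worker, which is presumably how the clash mentioned in the comment above is avoided.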

echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
echo "Starting prefill instance $i on GPUs $GPU_IDS, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_IDS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
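Once the shell strips the escaping, the value handed to --kv-transfer-config is plain JSON:

    {"kv_connector":"NixlConnector","kv_role":"kv_both"}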

if [ -n "$model_args" ]; then
@@ -96,20 +141,31 @@ run_tests_for_model() {
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
-    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # For tensor parallelism, we need to assign multiple consecutive GPUs
+    BASE_GPU_ID=$(((i * $DECODER_TP_SIZE + $NUM_PREFILL_INSTANCES * $PREFILLER_TP_SIZE) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Create a comma-separated list of GPU IDs for tensor parallelism
+    GPU_IDS=""
+    for ((j=0; j<$DECODER_TP_SIZE; j++)); do
+      if [ $j -gt 0 ]; then
+        GPU_IDS+=","
+      fi
+      GPU_IDS+="$((BASE_GPU_ID + j))"
+    done

# Calculate port number (base port + instance number)
PORT=$((8200 + i))
# Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5659 + i))
+    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE + $NUM_PREFILL_INSTANCES * $PREFILLER_TP_SIZE))
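Continuing the worked example with assumed values (NUM_PREFILL_INSTANCES=2, PREFILLER_TP_SIZE=2, DECODER_TP_SIZE=2, 8 GPUs), decode instance i=0 gets

    # BASE_GPU_ID       = (0 * 2 + 2 * 2) % 8  = 4   -> GPU_IDS="4,5"
    # SIDE_CHANNEL_PORT = 5659 + 0 * 2 + 2 * 2 = 5663

i.e. decode placement starts on the first GPU after all prefill TP workers, and the decode side-channel range is likewise offset past every prefill worker's port.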

echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
echo "Starting decode instance $i on GPUs $GPU_IDS, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_IDS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $DECODER_TP_SIZE \
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"

if [ -n "$model_args" ]; then
@@ -165,7 +221,7 @@ run_tests_for_model() {

# Run tests for each model
for model in "${MODELS[@]}"; do
run_tests_for_model "$model"
run_tests_for_model "$model" "$@"
done

echo "All tests completed!"