76 changes: 66 additions & 10 deletions tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -44,6 +44,39 @@ get_model_args() {
echo "$extra_args"
}

+set_cli_args() {
+    PREFILLER_TP_SIZE=1
+    DECODER_TP_SIZE=1
+    # Iterate through the rest of the arguments
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --prefiller-tp-size)
+                if [[ -n "$2" ]]; then
+                    PREFILLER_TP_SIZE="$2"
+                    shift 2 # Consume the flag and its value
+                else
+                    echo "Error: --prefiller-tp-size requires a value." >&2
+                    exit 1
+                fi
+                ;;
+            --decoder-tp-size)
+                if [[ -n "$2" ]]; then
+                    DECODER_TP_SIZE="$2"
+                    shift 2
+                else
+                    echo "Error: --decoder-tp-size requires a value." >&2
+                    exit 1
+                fi
+                ;;
+            *)
+                # Ignore any unrecognized arguments
+                shift
+                ;;
+        esac
+    done
+}
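For illustration, a minimal usage sketch of the new flags (the TP values here are hypothetical, not taken from this diff):

    # Run with 2-way TP prefill instances and 4-way TP decode instances
    ./run_accuracy_test.sh --prefiller-tp-size 2 --decoder-tp-size 4
    # Both flags default to 1; unrecognized arguments are shifted past and ignored.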


# Function to run tests for a specific model
run_tests_for_model() {
@@ -54,6 +87,7 @@ run_tests_for_model() {

# Get model-specific arguments
local model_args=$(get_model_args "$model_name")
+    set_cli_args "$@"

# Arrays to store all hosts and ports
PREFILL_HOSTS=()
@@ -64,20 +98,31 @@
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
-    GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # For tensor parallelism, we need to assign multiple consecutive GPUs
+    BASE_GPU_ID=$(((i * $PREFILLER_TP_SIZE) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Create a comma-separated list of GPU IDs for tensor parallelism
+    GPU_IDS=""
+    for ((j=0; j<$PREFILLER_TP_SIZE; j++)); do
+      if [ $j -gt 0 ]; then
+        GPU_IDS+=","
+      fi
+      GPU_IDS+="$((BASE_GPU_ID + j))"
+    done

# Calculate port number (base port + instance number)
PORT=$((8100 + i))
-    # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5559 + i))
+    # Calculate side channel port. Avoid clash with TP workers.
+    SIDE_CHANNEL_PORT=$((5559 + i * $PREFILLER_TP_SIZE))
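A worked example of the assignment arithmetic (node size and TP value assumed, not taken from this diff): with 8 visible GPUs and PREFILLER_TP_SIZE=2, prefill instance i=1 gets

    # BASE_GPU_ID       = (1 * 2) % 8  = 2   -> GPU_IDS="2,3"
    # PORT              = 8100 + 1     = 8101
    # SIDE_CHANNEL_PORT = 5559 + 1 * 2 = 5561

so consecutive instances advance the side-channel base by one port per TP worker, which is presumably how the clash mentioned in the comment above is avoided.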

echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
echo "Starting prefill instance $i on GPUs $GPU_IDS, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_IDS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
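Once the shell strips the escaping, the value handed to --kv-transfer-config is plain JSON:

    {"kv_connector":"NixlConnector","kv_role":"kv_both"}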

if [ -n "$model_args" ]; then
@@ -96,20 +141,31 @@ run_tests_for_model() {
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
-    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # For tensor parallelism, we need to assign multiple consecutive GPUs
+    BASE_GPU_ID=$(((i * $DECODER_TP_SIZE + $NUM_PREFILL_INSTANCES * $PREFILLER_TP_SIZE) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Create a comma-separated list of GPU IDs for tensor parallelism
+    GPU_IDS=""
+    for ((j=0; j<$DECODER_TP_SIZE; j++)); do
+      if [ $j -gt 0 ]; then
+        GPU_IDS+=","
+      fi
+      GPU_IDS+="$((BASE_GPU_ID + j))"
+    done

# Calculate port number (base port + instance number)
PORT=$((8200 + i))
# Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5659 + i))
+    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE + $NUM_PREFILL_INSTANCES * $PREFILLER_TP_SIZE))
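Continuing the worked example with assumed values (NUM_PREFILL_INSTANCES=2, PREFILLER_TP_SIZE=2, DECODER_TP_SIZE=2, 8 GPUs), decode instance i=0 gets

    # BASE_GPU_ID       = (0 * 2 + 2 * 2) % 8  = 4   -> GPU_IDS="4,5"
    # SIDE_CHANNEL_PORT = 5659 + 0 * 2 + 2 * 2 = 5663

i.e. decode placement starts on the first GPU after all prefill TP workers, and the decode side-channel range is likewise offset past every prefill worker's port.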

echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
echo "Starting decode instance $i on GPUs $GPU_IDS, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_IDS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $DECODER_TP_SIZE \
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"

if [ -n "$model_args" ]; then
@@ -165,7 +221,7 @@ run_tests_for_model() {

# Run tests for each model
for model in "${MODELS[@]}"; do
run_tests_for_model "$model"
run_tests_for_model "$model" "$@"
done

echo "All tests completed!"