Commit c59ca52 (parent 9ed29aa)

integration test

Signed-off-by: nicklucche <[email protected]>

2 files changed (+67, -25 lines)
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 44 additions & 6 deletions
@@ -44,6 +44,39 @@ get_model_args() {
   echo "$extra_args"
 }
 
+set_cli_args() {
+  PREFILLER_TP_SIZE=1
+  DECODER_TP_SIZE=1
+  # Iterate through the rest of the arguments
+  while [[ $# -gt 0 ]]; do
+    echo $#
+    case "$1" in
+      --prefiller-tp-size)
+        if [[ -n "$2" ]]; then
+          PREFILLER_TP_SIZE="$2"
+          shift 2 # Consume the flag and its value ($2)
+        else
+          echo "Error: --prefiller-tp-size requires a value." >&2
+          exit 1
+        fi
+        ;;
+      --decoder-tp-size)
+        if [[ -n "$2" ]]; then
+          DECODER_TP_SIZE="$2"
+          shift 2
+        else
+          echo "Error: --decoder-tp-size requires a value." >&2
+          exit 1
+        fi
+        ;;
+      *)
+        # Handle any arguments not recognized
+        shift # Ignore unknown argument
+        ;;
+    esac
+  done
+}
+
 
 # Function to run tests for a specific model
 run_tests_for_model() {
@@ -54,6 +87,7 @@ run_tests_for_model() {
 
   # Get model-specific arguments
   local model_args=$(get_model_args "$model_name")
+  set_cli_args "$@"
 
   # Arrays to store all hosts and ports
   PREFILL_HOSTS=()
@@ -65,19 +99,21 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
     GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
-    # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5559 + i))
+    # Calculate side channel port. Avoid clash with TP workers.
+    SIDE_CHANNEL_PORT=$((5559 + i * $PREFILLER_TP_SIZE))
 
     echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
 
     # Build the command with or without model-specific args
-    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+    BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
       --port $PORT \
       --enforce-eager \
       --disable-log-requests \
       --gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $PREFILLER_TP_SIZE \
       --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
@@ -97,19 +133,21 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
     GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5659 + i))
+    SIDE_CHANNEL_PORT=$((5659 + i * $PREFILLER_TP_SIZE))
 
     echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
 
     # Build the command with or without model-specific args
-    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+    BASE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
       --port $PORT \
      --enforce-eager \
      --disable-log-requests \
      --gpu-memory-utilization 0.2 \
+      --tensor-parallel-size $DECODER_TP_SIZE \
       --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
@@ -165,7 +203,7 @@ run_tests_for_model() {
 
 # Run tests for each model
 for model in "${MODELS[@]}"; do
-  run_tests_for_model "$model"
+  run_tests_for_model "$model" "$@"
 done
 
 echo "All tests completed!"

tests/v1/kv_connector/nixl_integration/toy_proxy_server.py

Lines changed: 23 additions & 19 deletions
@@ -151,18 +151,8 @@ async def send_request_to_service(client_info: dict, endpoint: str,
     Send a request to a service using a client from the pool.
     """
     req_data = req_data.copy()
-    req_data['kv_transfer_params'] = {
-        "do_remote_decode": True,
-        "do_remote_prefill": False,
-        "remote_engine_id": None,
-        "remote_block_ids": None,
-        "remote_host": None,
-        "remote_port": None
-    }
+    req_data['do_remote_decode'] = True
     req_data["stream"] = False
-    req_data["max_tokens"] = 1
-    if "stream_options" in req_data:
-        del req_data["stream_options"]
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
         "X-Request-Id": request_id
@@ -177,14 +167,22 @@ async def send_request_to_service(client_info: dict, endpoint: str,
 
 
 async def stream_service_response(client_info: dict, endpoint: str,
-                                  req_data: dict, request_id: str):
+                                  req_data: dict, remote_block_ids: list[int],
+                                  remote_engine_id: str, remote_host: str,
+                                  remote_port: int, request_id: str):
     """
     Asynchronously stream response from a service using a client from the pool.
     """
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
         "X-Request-Id": request_id
     }
+    req_data = req_data.copy()
+    req_data['do_remote_prefill'] = True
+    req_data["remote_block_ids"] = remote_block_ids
+    req_data['remote_engine_id'] = remote_engine_id
+    req_data["remote_host"] = remote_host
+    req_data["remote_port"] = remote_port
 
     async with client_info['client'].stream("POST",
                                             endpoint,
@@ -211,9 +209,10 @@ async def handle_completions(request: Request):
 
         # Extract the needed fields
         response_json = response.json()
-        kv_transfer_params = response_json.get('kv_transfer_params', {})
-        if kv_transfer_params:
-            req_data["kv_transfer_params"] = kv_transfer_params
+        remote_block_ids = response_json.get('remote_block_ids', [])
+        remote_engine_id = response_json.get('remote_engine_id', '')
+        remote_host = response_json.get('remote_host', '')
+        remote_port = response_json.get('remote_port', 0)
 
         # Get the next decode client in round-robin fashion
         decode_client_info = get_next_client(request.app, 'decode')
@@ -222,10 +221,15 @@ async def handle_completions(request: Request):
 
         # Stream response from decode service
         async def generate_stream():
-            async for chunk in stream_service_response(decode_client_info,
-                                                       "/completions",
-                                                       req_data,
-                                                       request_id=request_id):
+            async for chunk in stream_service_response(
+                    decode_client_info,
+                    "/completions",
+                    req_data,
+                    remote_block_ids=remote_block_ids,
+                    remote_engine_id=remote_engine_id,
+                    remote_host=remote_host,
+                    remote_port=remote_port,
+                    request_id=request_id):
                 yield chunk
 
         return StreamingResponse(generate_stream(),
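
Net effect on the proxy: the KV-transfer metadata that previously rode in a nested kv_transfer_params dict now travels as top-level request fields, and the prefill response's remote_block_ids, remote_engine_id, remote_host, and remote_port are forwarded explicitly to the decode call. An illustrative sketch of the two hops as raw HTTP requests, assuming the script's default ports (8100 prefill, 8200 decode), an OpenAI-style /v1/completions route, and placeholder model/prompt/metadata values; in practice the proxy injects these fields itself in send_request_to_service and stream_service_response:

  # 1) Prefill: flag the request for remote decode (top-level field).
  curl -s http://localhost:8100/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "...", "prompt": "...", "do_remote_decode": true, "stream": false}'
  # The response JSON carries remote_block_ids, remote_engine_id,
  # remote_host, and remote_port at the top level.

  # 2) Decode: forward that metadata, again as top-level fields
  #    (all values below are placeholders).
  curl -s http://localhost:8200/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "...", "prompt": "...", "do_remote_prefill": true,
         "remote_block_ids": [0, 1], "remote_engine_id": "engine-0",
         "remote_host": "localhost", "remote_port": 5659}'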
