Commit 6a5c7dd

add test scripts to examples

1 parent 0c74189 commit 6a5c7dd
File tree

6 files changed: +537 -0 lines changed

examples/lmcache/hpu/README.md

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
# LMCache Examples

Please note: HPU integration for LMCache will be upstreamed; once that lands, the following test cases can be used.

This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading, and KV cache sharing.

## 1. Disaggregated Prefill in vLLM v1

This example demonstrates how to run LMCache with disaggregated prefill using an lm or redis remote server on a single node.
### Prerequisites

- At least 2 HPU cards
- A valid Hugging Face token (`HF_TOKEN`) for Llama 3.1 8B Instruct, exported in your environment (see the sketch after this list)
- https://github.com/LMCache/LMCache/pull/1066 is needed for LMCache
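As a sketch of the setup that the helper checks in `disagg_example.sh` expect (a token starting with `hf_`, and at least two HPUs visible to `hl-smi`), you might do something like:

```bash
# Hypothetical environment setup; replace the placeholder token with your own.
export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx   # must start with "hf_" to pass the script's check

# Count visible HPU cards the same way disagg_example.sh does.
num_hpus=$(hl-smi --query-gpu=name --format=csv,noheader | wc -l)
echo "Found ${num_hpus} HPU(s); at least 2 are required for disaggregated prefill."
```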
### Usage

Run `cd disagg_prefill_lmcache_v1` to enter the `disagg_prefill_lmcache_v1` folder, and then run

```bash
PT_HPU_GPU_MIGRATION=1 VLLM_USE_V1=1 VLLM_SKIP_WARMUP=True PT_HPU_ENABLE_LAZY_COLLECTIVES=true bash disagg_example.sh
```

to run disaggregated prefill and benchmark the performance.

The lm server is the default remote server; the remote server type, tensor_parallel_size, and model name are all configurable.

Example: redis server, tensor_parallel_size 4, and the Llama-3.1-70B-Instruct model

```bash
PT_HPU_GPU_MIGRATION=1 VLLM_USE_V1=1 VLLM_SKIP_WARMUP=True PT_HPU_ENABLE_LAZY_COLLECTIVES=true bash disagg_example.sh -s redis -t 4 -m meta-llama/Llama-3.1-70B-Instruct
```
### Components

#### Server Scripts
- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches the individual vLLM servers for prefill and decode
- `../disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between the prefiller and the decoder (see the example request below)
- `disagg_prefill_lmcache_v1/disagg_example.sh` - Main script that runs the example through the lm/redis remote server and also launches the proxy server
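Once `disagg_example.sh` has started the prefiller (port 1100), the decoder (port 1200), and the proxy (port 1000), you can also send a request to the proxy by hand. A minimal sketch, assuming the OpenAI-compatible `/v1/completions` endpoint that the scripts poll and the default model name:

```bash
# Hypothetical manual request against the proxy started by disagg_example.sh (port 1000).
curl -s http://localhost:1000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "prompt": "Explain disaggregated prefill in one sentence.",
        "max_tokens": 64
      }'
```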
#### Configuration
- `disagg_prefill_lmcache_v1/configs/lmcache-config-lm.yaml` - Configuration for the prefiller/decoder servers when using the lm server
- `disagg_prefill_lmcache_v1/configs/lmcache-config-redis.yaml` - Configuration for the prefiller/decoder servers when using the redis server

#### Log Files
The main script generates several log files (you can follow them as shown below):
- `prefiller.log` - Logs from the prefill server
- `decoder.log` - Logs from the decode server
- `proxy.log` - Logs from the proxy server
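For example, to watch all three servers come up while the example runs, follow the logs from another terminal:

```bash
# Follow the prefiller, decoder, and proxy logs side by side.
tail -f prefiller.log decoder.log proxy.log
```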
## 2. KV Cache Sharing

The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.

### Usage

```bash
PT_HPU_GPU_MIGRATION=1 VLLM_USE_V1=1 VLLM_SKIP_WARMUP=True PT_HPU_ENABLE_LAZY_COLLECTIVES=true python kv_cache_sharing_lmcache_v1.py
```

The lm server is the default remote server; the remote server type and tensor_parallel_size are both configurable.

Example 1: redis server with port 6380

```bash
PT_HPU_GPU_MIGRATION=1 VLLM_USE_V1=1 VLLM_SKIP_WARMUP=True PT_HPU_ENABLE_LAZY_COLLECTIVES=true python kv_cache_sharing_lmcache_v1.py --remote_server redis --redis_port 6380
```

Example 2: lm server with port 8108 and tensor_parallel_size 2

```bash
PT_HPU_GPU_MIGRATION=1 VLLM_USE_V1=1 VLLM_SKIP_WARMUP=True PT_HPU_ENABLE_LAZY_COLLECTIVES=true python kv_cache_sharing_lmcache_v1.py --lm_port 8108 --tp_size 2
```
examples/lmcache/hpu/disagg_prefill_lmcache_v1/configs/lmcache-config-lm.yaml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
chunk_size: 256                    # tokens per KV cache chunk
local_cpu: False                   # disable local CPU offloading
max_local_cpu_size: 5.0            # max local CPU cache size in GB (unused while local_cpu is False)
#local_disk:
max_local_disk_size: 0             # no local disk cache
remote_serde: naive                # serializer for KV chunks sent to the remote backend
remote_url: "lm://localhost:8100"  # LMCache lm server started by disagg_example.sh
examples/lmcache/hpu/disagg_prefill_lmcache_v1/configs/lmcache-config-redis.yaml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
chunk_size: 256                        # tokens per KV cache chunk
local_cpu: False                       # disable local CPU offloading
max_local_cpu_size: 5.0                # max local CPU cache size in GB (unused while local_cpu is False)
#local_disk:
max_local_disk_size: 0                 # no local disk cache
remote_serde: naive                    # serializer for KV chunks sent to the remote backend
remote_url: "redis://localhost:6379"   # redis server started by disagg_example.sh
examples/lmcache/hpu/disagg_prefill_lmcache_v1/disagg_example.sh

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
#!/bin/bash

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."

usage() {
    echo ""
    echo "Runs the LMCache disaggregated prefill example (prefiller, decoder, and proxy) using vLLM"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -s - remote_server (redis/lm). default:lm"
    echo " -t - tensor parallel size. default:1"
    echo " -m - model. default:meta-llama/Llama-3.1-8B-Instruct"
    echo
}
PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}
check_num_gpus() {
    # Check that at least 2 HPU devices are visible via hl-smi
    num_gpus=$(hl-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 HPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus HPUs."
    fi
}
ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    python -c "import $1" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        if [ "$1" == "nixl" ]; then
            echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
        else
            echo "$1 is not installed. Please install it via pip install $1."
        fi
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM        # prevent re-entrancy
    kill -- -$$            # negative PID == "this whole process-group"
    wait                   # reap children so we don't leave zombies
    exit 0
}
wait_for_server() {
    local port=$1
    local timeout_seconds=1200
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server"
            return 1
        fi

        sleep 1
    done
}

SERVER="lm"
TP_SIZE=1
MODEL="llama3.1/Meta-Llama-3.1-8B-Instruct"

main() {
    while [[ "$#" -gt 0 ]]; do
        case $1 in
            -s) SERVER="$2"; shift ;;
            -t) TP_SIZE="$2"; shift ;;
            -m) MODEL="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; exit 1 ;;
        esac
        shift
    done

    echo "server: $SERVER"
    echo "tensor parallel size: $TP_SIZE"
    echo "model: $MODEL"

    #check_hf_token
    check_num_gpus
    ensure_python_library_installed lmcache
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching prefiller, decoder and proxy..."
    echo "Please check prefiller.log, decoder.log and proxy.log for logs."

    if [[ $SERVER == "lm" ]]; then
        echo "starting lmcache "
        python -m lmcache.v1.server localhost 8100 2>&1 &
    elif [[ $SERVER == "redis" ]]; then
        echo "starting redis-server "
        redis-server --port 6379 &
    else
        echo "Invalid server: $SERVER"
        exit 1
    fi

    echo "start prefiller "
    bash disagg_vllm_launcher.sh prefiller $SERVER $TP_SIZE $MODEL \
        > >(tee prefiller.log) 2>&1 &
    prefiller_pid=$!
    PIDS+=($prefiller_pid)

    echo "start decoder "
    bash disagg_vllm_launcher.sh decoder $SERVER $TP_SIZE $MODEL \
        > >(tee decoder.log) 2>&1 &
    decoder_pid=$!
    PIDS+=($decoder_pid)

    python3 ../../disagg_prefill_lmcache_v1/disagg_proxy_server.py \
        --host localhost \
        --port 1000 \
        --prefiller-host localhost \
        --prefiller-port 1100 \
        --decoder-host localhost \
        --decoder-port 1200 \
        > >(tee proxy.log) 2>&1 &
    proxy_pid=$!
    PIDS+=($proxy_pid)

    wait_for_server 1100
    wait_for_server 1200
    wait_for_server 1000

    echo "All servers are up. Starting benchmark..."

    # begin benchmark
    cd ../../../../../benchmarks/
    python benchmark_serving.py --port 1000 --seed 12345 \
        --model $MODEL \
        --dataset-name random --random-input-len 8000 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup
}

main "$@"
examples/lmcache/hpu/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
#!/bin/bash

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <prefiller | decoder> [server] [tp] [model]"
    exit 1
fi

SERVER="lm"
TP_SIZE=1
MODEL="llama3.1/Meta-Llama-3.1-8B-Instruct"

if [[ $# -eq 1 ]]; then
    echo "Using default server: $SERVER"
    echo "Using default tp: $TP_SIZE"
    echo "Using default model: $MODEL"
else
    SERVER=$2
    TP_SIZE=$3
    MODEL=$4
    echo "Using server: $SERVER"
    echo "Using tp: $TP_SIZE"
    echo "Using model: $MODEL"
fi


if [[ $1 == "prefiller" ]]; then
    if [[ $SERVER == "lm" ]]; then
        # Remote lm server backend (remote_url lm://localhost:8100)
        prefill_config_file=$SCRIPT_DIR/configs/lmcache-config-lm.yaml
    elif [[ $SERVER == "redis" ]]; then
        # Remote redis backend (remote_url redis://localhost:6379)
        prefill_config_file=$SCRIPT_DIR/configs/lmcache-config-redis.yaml
    else
        echo "Invalid server: $SERVER"
        exit 1
    fi

    #UCX_TLS=tcp \
    LMCACHE_CONFIG_FILE=$prefill_config_file \
    VLLM_ENABLE_V1_MULTIPROCESSING=1 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    RANK=0 \
    vllm serve $MODEL \
        --port 1100 \
        --disable-log-requests \
        --tensor_parallel_size $TP_SIZE \
        --kv-transfer-config \
        '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'


elif [[ $1 == "decoder" ]]; then
    if [[ $SERVER == "lm" ]]; then
        # Remote lm server backend (remote_url lm://localhost:8100)
        decode_config_file=$SCRIPT_DIR/configs/lmcache-config-lm.yaml
    elif [[ $SERVER == "redis" ]]; then
        # Remote redis backend (remote_url redis://localhost:6379)
        decode_config_file=$SCRIPT_DIR/configs/lmcache-config-redis.yaml
    else
        echo "Invalid server: $SERVER"
        exit 1
    fi

    #UCX_TLS=tcp \
    LMCACHE_CONFIG_FILE=$decode_config_file \
    VLLM_ENABLE_V1_MULTIPROCESSING=1 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    RANK=1 \
    vllm serve $MODEL \
        --port 1200 \
        --disable-log-requests \
        --tensor_parallel_size $TP_SIZE \
        --kv-transfer-config \
        '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'


else
    echo "Invalid role: $1"
    echo "Role should be either prefiller or decoder"
    exit 1
fi
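For reference, `disagg_example.sh` drives this launcher as `bash disagg_vllm_launcher.sh <role> <server> <tp> <model>`. A minimal sketch of running the two roles by hand from the `disagg_prefill_lmcache_v1` folder, mirroring that invocation (default lm backend and the README's model name assumed):

```bash
# Hypothetical manual launch, mirroring how disagg_example.sh calls this script.
bash disagg_vllm_launcher.sh prefiller lm 1 meta-llama/Llama-3.1-8B-Instruct > >(tee prefiller.log) 2>&1 &
bash disagg_vllm_launcher.sh decoder   lm 1 meta-llama/Llama-3.1-8B-Instruct > >(tee decoder.log) 2>&1 &
```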
