diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ddb38e304cd6..0fdd134af95d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -295,6 +295,7 @@ async def async_request_openai_completions(
             url=api_url, json=payload, headers=headers
         ) as response:
             if response.status == 200:
+                first_chunk_received = False
                 async for chunk_bytes in response.content:
                     chunk_bytes = chunk_bytes.strip()
@@ -318,7 +319,8 @@ async def async_request_openai_completions(
                             first_chunk_received = True
                             ttft = time.perf_counter() - st
                             output.ttft = ttft
-
+                            #print(f'libin debug backend request {ttft=}')
+                            sys.stdout.flush()
                         # Decoding phase
                         else:
                             output.itl.append(timestamp - most_recent_timestamp)
diff --git a/requirements/hpu.txt b/requirements/hpu.txt
index bd4189ef951a..8d25828248f0 100644
--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
@@ -7,7 +7,7 @@
 ray
 triton==3.1.0
 setuptools>=77.0.3
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@6b2f6fb
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@ce0e48
 # Dependencies for HPU vllm docker image
 datasets
diff --git a/tests/v1/kv_connector/nixl_integration/requirements.txt b/tests/v1/kv_connector/nixl_integration/requirements.txt
new file mode 100644
index 000000000000..5a728f17a419
--- /dev/null
+++ b/tests/v1/kv_connector/nixl_integration/requirements.txt
@@ -0,0 +1,4 @@
+pytest
+nixl==0.5.0
+lm-eval
+lm-eval[api]
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index b48655d80eef..892ba4fbee0a 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -5,6 +5,13 @@ set -xe
 MODELS=(
     "Qwen/Qwen3-0.6B"
 )
+#MODELS=(
+#    "meta-llama/Llama-3.1-8B"
+#)
+
+export VLLM_USE_V1=1
+export VLLM_SKIP_WARMUP="true"
+export PT_HPU_LAZY_MODE=1
 
 # Number of prefill and decode instances to create
 NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1}  # Default to 1
@@ -13,9 +20,10 @@ PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 
 # Find the git repository root directory
-GIT_ROOT=$(git rev-parse --show-toplevel)
+#GIT_ROOT=$(git rev-parse --show-toplevel)
+GIT_ROOT="/home/vllm-nixl/vllm"
 
-SMI_BIN=$(which nvidia-smi || which rocm-smi)
+#SMI_BIN=$(which nvidia-smi || which rocm-smi)
 
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -25,7 +33,7 @@ wait_for_server() {
   local port=$1
   timeout 1200 bash -c "
     until curl -s localhost:${port}/v1/completions > /dev/null; do
-      sleep 1
+    sleep 1
     done" && return 0 || return 1
 }
 
@@ -75,23 +83,24 @@ run_tests_for_model() {
   # Start prefill instances
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
-    GPU_ID=$((i % $(get_num_gpus)))
+    #GPU_ID=$((i % $(get_num_gpus)))
+    GPU_ID=2
 
     # Calculate port number (base port + instance number)
-    PORT=$((8100 + i))
+    PORT=$((8300 + i))
     # Calculate side channel port. Avoid clash with with TP workers.
SIDE_CHANNEL_PORT=$((5559 + i)) echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="RANK=0 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ - --gpu-memory-utilization 0.2 \ + --max_num_batched_tokens 8192 \ + --gpu-memory-utilization 0.3 \ --tensor-parallel-size $PREFILLER_TP_SIZE \ - --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" if [ -n "$model_args" ]; then FULL_CMD="$BASE_CMD $model_args" @@ -109,22 +118,22 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) - PORT=$((8200 + i)) + PORT=$((8400 + i)) # Calculate side channel port SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE)) echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="RANK=1 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ - --gpu-memory-utilization 0.2 \ + --max_num_batched_tokens 8192 \ + --gpu-memory-utilization 0.3 \ --tensor-parallel-size $DECODER_TP_SIZE \ - --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" if [ -n "$model_args" ]; then FULL_CMD="$BASE_CMD $model_args" @@ -151,7 +160,7 @@ run_tests_for_model() { done # Build the command for the proxy server with all the hosts and ports - PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192" + PROXY_CMD="python toy_proxy_server.py --port 9192" # Add all prefill hosts and ports PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" @@ -166,11 +175,39 @@ run_tests_for_model() { $PROXY_CMD & # Wait for the proxy to start - sleep 5 - + sleep 10 + +# curl -X POST -s http://localhost:9192/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. 
He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' + sleep 5 + echo "--------------------===================-------------" +#curl -X POST -s http://localhost:9192/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", +# "max_tokens": 5, +# "temperature": 0 +# }' +# curl -X POST -s http://localhost:9192/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. 
I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", +# "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], +# "max_tokens": 2, +# "temperature": 0 +# }' + #sleep 10000 # Run lm eval for this model echo "Running tests for $model_name" - TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py + TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py # Clean up before running next model cleanup_instances diff --git a/tests/v1/kv_connector/nixl_integration/run_benchmark_profile.sh b/tests/v1/kv_connector/nixl_integration/run_benchmark_profile.sh new file mode 100644 index 000000000000..af2d65ef6b20 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_benchmark_profile.sh @@ -0,0 +1,280 @@ +#!/bin/bash +set -xe + +# Models to run +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +#MODELS=( +# "meta-llama/Llama-3.1-8B" +#) + + +MODELS=( + "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" +) + +CMD="hl-prof-config --use-template profile_api --hw-trace off" +eval "$CMD &" +export VLLM_USE_V1=1 +export VLLM_SKIP_WARMUP=True +export PT_HPU_LAZY_MODE=1 +export HABANA_PROFILE=1 +#Enable full vLLM Profiler and instruct where to save the profiling: +export VLLM_PROFILER_ENABLED=1 +export VLLM_TORCH_PROFILER_DIR=./ + + +# Number of prefill and decode instances to create +NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 +NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 +PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} + +# Find the git repository root directory +#GIT_ROOT=$(git rev-parse --show-toplevel) +GIT_ROOT="/home/vllm-nixl/vllm" + +#SMI_BIN=$(which nvidia-smi || which rocm-smi) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Waits for vLLM to start. 
+wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." + pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then + extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code" + fi + + echo "$extra_args" +} + +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Arrays to store all hosts and ports + PREFILL_HOSTS=() + PREFILL_PORTS=() + DECODE_HOSTS=() + DECODE_PORTS=() + + # Start prefill instances + for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs + #GPU_ID=$((i % $(get_num_gpus))) + GPU_ID=2 + + # Calculate port number (base port + instance number) + PORT=$((8300)) + # Calculate side channel port. Avoid clash with with TP workers. + SIDE_CHANNEL_PORT=$((6559 + i)) + + echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="RANK=0 HABANA_VISIBLE_DEVICES=2 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --gpu-memory-utilization 0.3 \ + --disable-log-requests \ + --tensor-parallel-size $PREFILLER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + PREFILL_HOSTS+=("localhost") + PREFILL_PORTS+=($PORT) + done + + # Start decode instances + for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + # Calculate port number (base port + instance number) + PORT=$((8400)) + # Calculate side channel port + SIDE_CHANNEL_PORT=$((5559 + i * $DECODER_TP_SIZE)) + + echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="RANK=1 HABANA_VISIBLE_DEVICES=3 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $DECODER_TP_SIZE \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port 
for proxy configuration + DECODE_HOSTS+=("localhost") + DECODE_PORTS+=($PORT) + done + + # Wait for all instances to start + for PORT in "${PREFILL_PORTS[@]}"; do + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PORT + done + + for PORT in "${DECODE_PORTS[@]}"; do + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $PORT + done + + # Build the command for the proxy server with all the hosts and ports + PROXY_CMD="python toy_proxy_server.py --port 9191" + + # Add all prefill hosts and ports + PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}" + + # Add all decode hosts and ports + PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 50 + +# curl -X POST -s http://localhost:9191/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' +# sleep 5 +# echo "--------------------===================-------------" +#curl -X POST -s http://localhost:9191/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. 
According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", +# "max_tokens": 5, +# "temperature": 0 +# }' + #curl -X POST -s http://localhost:9191/v1/completions \ + # -H "Content-Type: application/json" \ + # -d '{ + # "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", + # "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", + # "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. 
Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], + # "max_tokens": 2, + # "temperature": 0 + # }' + # curl -X POST http://localhost:8300/start_profile + # curl -X POST http://localhost:8400/start_profile + # sleep 3 + + # Run lm eval for this model + echo "Running tests for $model_name" + #TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py + python3 ../../../../benchmarks/benchmark_serving.py \ + --port 9191 \ + --seed 12345 \ + --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + --dataset-name random \ + --random-input-len 8192 \ + --random-output-len 5 \ + --num-prompts 10 \ + --burstiness 100 \ + --request-rate 0.1 \ + --metric-percentiles 95 \ + --percentile-metrics ttft,tpot,itl,e2el \ + --backend openai \ + --endpoint /v1/completions \ + --ignore-eos + + sleep 10 + curl -X POST http://localhost:8300/start_profile + curl -X POST http://localhost:8400/start_profile + + python3 ../../../../benchmarks/benchmark_serving.py \ + --port 9191 \ + --seed 12345 \ + --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + --dataset-name random \ + --random-input-len 8192 \ + --random-output-len 5 \ + --num-prompts 10 \ + --burstiness 100 \ + --request-rate 0.1 \ + --metric-percentiles 95 \ + --percentile-metrics ttft,tpot,itl,e2el \ + --backend openai \ + --endpoint /v1/completions \ + --ignore-eos + + + sleep 10 + curl -X POST http://localhost:8300/stop_profile + curl -X POST http://localhost:8400/stop_profile + + sleep 10 + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" 
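For quick reference, the profiling flow that run_benchmark_profile.sh implements reduces to the minimal sketch below: one warm-up benchmark pass, then a second pass bracketed by the vLLM /start_profile and /stop_profile endpoints on the prefill (8300) and decode (8400) servers. This is a trimmed sketch, not a drop-in replacement for the script; it assumes both servers were started with VLLM_TORCH_PROFILER_DIR set (as above), and $MODEL is a placeholder for the local Llama-3.1-8B-Instruct snapshot path.

# Minimal sketch of the profiling window used above (assumes the prefill/decode
# servers on 8300/8400 and the toy proxy on 9191 are already running;
# $MODEL is a placeholder path).
python3 benchmarks/benchmark_serving.py --backend openai --endpoint /v1/completions \
  --port 9191 --model "$MODEL" --dataset-name random \
  --random-input-len 8192 --random-output-len 5 --num-prompts 10 --ignore-eos   # warm-up, not profiled

curl -X POST http://localhost:8300/start_profile   # prefill server
curl -X POST http://localhost:8400/start_profile   # decode server

python3 benchmarks/benchmark_serving.py --backend openai --endpoint /v1/completions \
  --port 9191 --model "$MODEL" --dataset-name random \
  --random-input-len 8192 --random-output-len 5 --num-prompts 10 --ignore-eos   # profiled pass

curl -X POST http://localhost:8300/stop_profile
curl -X POST http://localhost:8400/stop_profile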
diff --git a/tests/v1/kv_connector/nixl_integration/run_benchmark_test.sh b/tests/v1/kv_connector/nixl_integration/run_benchmark_test.sh new file mode 100755 index 000000000000..6ac657a65073 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_benchmark_test.sh @@ -0,0 +1,310 @@ +#!/bin/bash +set -xe + +# Models to run +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +#MODELS=( +# "meta-llama/Llama-3.1-8B" +#) + + +MODELS=( + "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" +) + +export VLLM_USE_V1=1 +export VLLM_SKIP_WARMUP=True +export PT_HPU_LAZY_MODE=1 +export VLLM_EXPONENTIAL_BUCKETING=False +#export VLLM_PROMPT_BS_BUCKET_MIN=1 +#export VLLM_PROMPT_SEQ_BUCKET_MIN=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=8192 +export VLLM_PROMPT_SEQ_BUCKET_STEP=8192 +export VLLM_PROMPT_SEQ_BUCKET_MAX=8192 +export VLLM_DECODE_BLOCK_BUCKET_MIN=1024 +export VLLM_DECODE_BLOCK_BUCKET_MAX=1184 +export VLLM_USE_PADDING_AWARE_SCHEDULING=1 + +# Number of prefill and decode instances to create +NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 +NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 +PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} + +# Find the git repository root directory +#GIT_ROOT=$(git rev-parse --show-toplevel) +GIT_ROOT="/home/vllm-nixl/vllm" + +#SMI_BIN=$(which nvidia-smi || which rocm-smi) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Waits for vLLM to start. +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." + pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then + extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code" + fi + + echo "$extra_args" +} + +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Arrays to store all hosts and ports + PREFILL_HOSTS=() + PREFILL_PORTS=() + DECODE_HOSTS=() + DECODE_PORTS=() + + # Start prefill instances + for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs + #GPU_ID=$((i % $(get_num_gpus))) + GPU_ID=2 + + # Calculate port number (base port + instance number) + PORT=$((8300 + i)) + # Calculate side channel port. Avoid clash with with TP workers. 
+ SIDE_CHANNEL_PORT=$((5559 + i)) + + echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="HABANA_VISIBLE_DEVICES=0 RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $PREFILLER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + PREFILL_HOSTS+=("localhost") + PREFILL_PORTS+=($PORT) + done + + # Start decode instances + for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + # Calculate port number (base port + instance number) + PORT=$((8400 + i)) + # Calculate side channel port + SIDE_CHANNEL_PORT=$((4659 + i * $DECODER_TP_SIZE)) + + echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="HABANA_VISIBLE_DEVICES=1 RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $DECODER_TP_SIZE \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + DECODE_HOSTS+=("localhost") + DECODE_PORTS+=($PORT) + done + + # Wait for all instances to start + for PORT in "${PREFILL_PORTS[@]}"; do + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PORT + done + + for PORT in "${DECODE_PORTS[@]}"; do + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $PORT + done + + # Build the command for the proxy server with all the hosts and ports + PROXY_CMD="python toy_proxy_server.py --port 9111" + + # Add all prefill hosts and ports + PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}" + + # Add all decode hosts and ports + PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 100 + +# curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. 
Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' +# sleep 5 +# echo "--------------------===================-------------" +#curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", +# "max_tokens": 5, +# "temperature": 0 +# }' + #curl -X POST -s http://localhost:9111/v1/completions \ + # -H "Content-Type: application/json" \ + # -d '{ + # "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", + # "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. 
Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", + # "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. 
According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], + # "max_tokens": 2, + # "temperature": 0 + # }' + sleep 2 + # Run lm eval for this model + echo "Running tests for $model_name" + #TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 256 \ + # --num-prompts 32 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + + #sleep 100 + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 8300 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 200 \ + # --num-prompts 100 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + qps=(0.5) #(0.1 0.25 0.5 1 2 3 4) # 5) + # explicit num_prompts mapping (must have same length as qps[]) + num_prompts=(32) #(32 64 128 256 256 256 256) # 256) + input_len=8192 + output_len=256 #56 + + # just sanity‐check lengths + if [ "${#qps[@]}" -ne "${#num_prompts[@]}" ]; then + echo "❌ qps[] and num_prompts[] must be the same length" + exit 1 + fi + + for i in "${!qps[@]}"; do + q=${qps[$i]} + np=${num_prompts[$i]} + + ts=$(date +"%Y%m%d_%H%M%S") + logf="./nixlresult/run_in${input_len}_out${output_len}_qps${q//./p}_$ts.log" + + echo "[$(date +"%Y-%m-%d %H:%M:%S")] input=${input_len}, output=${output_len}, qps=${q}, num_prompts=${np}" \ + | tee "$logf" + + python3 ../../../../benchmarks/benchmark_serving.py \ + --port 9111 \ + --seed "$(date +%s)" \ + --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + --tokenizer /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + --dataset-name random \ + --random-input-len "$input_len" \ + --random-output-len 256 \ + --num-prompts "$np" \ + --request-rate "$q" \ + --percentile-metrics ttft,tpot,itl,e2el \ + --burstiness 100 \ + --backend openai \ + --endpoint /v1/completions \ + --ignore-eos \ + 2>&1 | tee -a "$logf" + + done + + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" 
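Before the QPS sweep in run_benchmark_test.sh, a single short request through the toy proxy is a useful sanity check that the prefill-to-decode KV handoff works. The snippet below is a trimmed version of the commented-out probes above; the proxy port (9111) and model path are taken from the script, while the prompt text is arbitrary.

# Minimal smoke test against the toy proxy (port 9111, per the script above).
curl -s -X POST http://localhost:9111/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/",
        "prompt": "Say hello in five words.",
        "max_tokens": 5,
        "temperature": 0
      }'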
diff --git a/tests/v1/kv_connector/nixl_integration/run_benchmark_test_heter.sh b/tests/v1/kv_connector/nixl_integration/run_benchmark_test_heter.sh new file mode 100644 index 000000000000..83a10fbbdb83 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_benchmark_test_heter.sh @@ -0,0 +1,318 @@ +#!/bin/bash +set -xe + +# Models to run +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +#MODELS=( +# "meta-llama/Llama-3.1-8B" +#) + + +MODELS=( + "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" +) +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=1000000 +export VLLM_RPC_TIMEOUT=1000000000 +export NIXL_LOG_LEVEL=debug +#export UCX_LOG_LEVEL=debug +export VLLM_USE_V1=1 +export VLLM_SKIP_WARMUP=True +export PT_HPU_LAZY_MODE=1 +export VLLM_EXPONENTIAL_BUCKETING=False +export VLLM_PROMPT_BS_BUCKET_MIN=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=8192 +export VLLM_PROMPT_SEQ_BUCKET_STEP=8192 +export VLLM_PROMPT_SEQ_BUCKET_MAX=8192 +export VLLM_DECODE_BLOCK_BUCKET_MIN=1024 +export VLLM_DECODE_BLOCK_BUCKET_MAX=1184 +export VLLM_USE_PADDING_AWARE_SCHEDULING=1 +export DECODER_TP_RATIO=2 + +# Number of prefill and decode instances to create +NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 +NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 +PREFILLER_TP_SIZE=2 #${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=4 #${DECODER_TP_SIZE:-1} + + +# Find the git repository root directory +#GIT_ROOT=$(git rev-parse --show-toplevel) +GIT_ROOT="/home/vllm-nixl/vllm" + +#SMI_BIN=$(which nvidia-smi || which rocm-smi) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Waits for vLLM to start. +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." + pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then + extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code" + fi + + echo "$extra_args" +} + +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Arrays to store all hosts and ports + PREFILL_HOSTS=() + PREFILL_PORTS=() + DECODE_HOSTS=() + DECODE_PORTS=() + + # Start prefill instances + for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs + #GPU_ID=$((i % $(get_num_gpus))) + GPU_ID=2 + + # Calculate port number (base port + instance number) + PORT=$((8300 + i)) + # Calculate side channel port. Avoid clash with with TP workers. 
+ SIDE_CHANNEL_PORT=$((5559 + i)) + + echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="MY_ROLE=PREFILL UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $PREFILLER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + PREFILL_HOSTS+=("localhost") + PREFILL_PORTS+=($PORT) + done + + # Start decode instances + for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + # Calculate port number (base port + instance number) + PORT=$((8400 + i)) + # Calculate side channel port + SIDE_CHANNEL_PORT=$((4659 + i * $DECODER_TP_SIZE)) + + echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="MY_ROLE=DECODE UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $DECODER_TP_SIZE \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + DECODE_HOSTS+=("localhost") + DECODE_PORTS+=($PORT) + done + + # Wait for all instances to start + for PORT in "${PREFILL_PORTS[@]}"; do + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PORT + done + + for PORT in "${DECODE_PORTS[@]}"; do + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $PORT + done + + # Build the command for the proxy server with all the hosts and ports + PROXY_CMD="python toy_proxy_server.py --port 9111" + + # Add all prefill hosts and ports + PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}" + + # Add all decode hosts and ports + PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 10 + +# curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. 
Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' +# sleep 5 +# echo "--------------------===================-------------" +curl -X POST -s http://localhost:9111/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", + "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", + "max_tokens": 50, + "temperature": 0 + }' +#curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", +# "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. 
Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", +# "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. 
According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], +# "max_tokens": 100, +# "temperature": 0 +# }' + #sleep 2 + # Run lm eval for this model + #echo "Running tests for $model_name" + #TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 256 \ + # --num-prompts 32 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + + #sleep 100 + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 8300 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 200 \ + # --num-prompts 100 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + qps=(0.5) #(0.1 0.25 0.5 1 2 3 4) # 5) + # explicit num_prompts mapping (must have same length as qps[]) + num_prompts=(32) #(32 64 128 256 256 256 256) # 256) + input_len=8192 + output_len=256 #56 + + # just sanity‐check lengths + #if [ "${#qps[@]}" -ne "${#num_prompts[@]}" ]; then + # echo "❌ qps[] and num_prompts[] must be the same length" + # exit 1 + #fi + + #for i in "${!qps[@]}"; do + #q=${qps[$i]} + #np=${num_prompts[$i]} + + #ts=$(date +"%Y%m%d_%H%M%S") + #logf="./nixlresult/run_in${input_len}_out${output_len}_qps${q//./p}_$ts.log" + + #echo "[$(date +"%Y-%m-%d %H:%M:%S")] input=${input_len}, output=${output_len}, qps=${q}, num_prompts=${np}" \ + # | tee "$logf" + + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --tokenizer /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len "$input_len" \ + # --random-output-len 256 \ + # --num-prompts "$np" \ + # --request-rate "$q" \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --burstiness 100 \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos \ + # 2>&1 | tee -a "$logf" + + #done + + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" 
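The heterogeneous variant (run_benchmark_test_heter.sh) launches the same topology as the other scripts but with a TP2 prefill instance feeding a TP4 decode instance. A condensed sketch of that launch pattern follows; ports, UCX_TLS, and the kv-transfer-config are copied from the script, MODEL is a placeholder for the local snapshot path, and MY_ROLE / DECODER_TP_RATIO are environment variables the script exports that are presumably consumed by the patched HPU/NIXL code rather than by upstream vLLM.

# Condensed sketch of the heterogeneous launch (TP2 prefill -> TP4 decode -> proxy).
MODEL=/path/to/Llama-3.1-8B-Instruct-snapshot   # placeholder for the local snapshot path

MY_ROLE=PREFILL UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
  vllm serve "$MODEL" --port 8300 --tensor-parallel-size 2 \
  --max_num_batched_tokens 8192 --gpu-memory-utilization 0.3 --disable-log-requests \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device":"cpu"}' &

MY_ROLE=DECODE UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=4659 \
  vllm serve "$MODEL" --port 8400 --tensor-parallel-size 4 \
  --max_num_batched_tokens 8192 --gpu-memory-utilization 0.3 --disable-log-requests \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device":"cpu"}' &

python toy_proxy_server.py --port 9111 \
  --prefiller-hosts localhost --prefiller-ports 8300 \
  --decoder-hosts localhost --decoder-ports 8400 &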
diff --git a/tests/v1/kv_connector/nixl_integration/run_hpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_hpu_disagg_accuracy_test.sh new file mode 100755 index 000000000000..cac822cdb180 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_hpu_disagg_accuracy_test.sh @@ -0,0 +1,222 @@ +#!/bin/bash +set -xe + +# Models to run +MODELS=( + "Qwen/Qwen3-0.6B" +) +MODELS=( + "meta-llama/Llama-3.1-8B" +) + +export VLLM_SKIP_WARMUP="true" +#export PT_HPU_LAZY_MODE=1 + +# Number of prefill and decode instances to create +NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 +NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 +PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} + +# Find the git repository root directory +#GIT_ROOT=$(git rev-parse --show-toplevel) +GIT_ROOT="/home/vllm-nixl/vllm" + +#SMI_BIN=$(which nvidia-smi || which rocm-smi) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Waits for vLLM to start. +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." + pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then + extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code" + fi + + echo "$extra_args" +} + +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Arrays to store all hosts and ports + PREFILL_HOSTS=() + PREFILL_PORTS=() + DECODE_HOSTS=() + DECODE_PORTS=() + + # Start prefill instances + for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs + #GPU_ID=$((i % $(get_num_gpus))) + GPU_ID=2 + + # Calculate port number (base port + instance number) + PORT=$((8300 + i)) + # Calculate side channel port. Avoid clash with with TP workers. 
+ SIDE_CHANNEL_PORT=$((6559 + i)) + + echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $PREFILLER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + PREFILL_HOSTS+=("localhost") + PREFILL_PORTS+=($PORT) + done + + # Start decode instances + for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=6 + # Calculate port number (base port + instance number) + PORT=$((8400 + i)) + # Calculate side channel port + SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE)) + + echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $DECODER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + DECODE_HOSTS+=("localhost") + DECODE_PORTS+=($PORT) + done + + # Wait for all instances to start + for PORT in "${PREFILL_PORTS[@]}"; do + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PORT + done + + for PORT in "${DECODE_PORTS[@]}"; do + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $PORT + done + + # Build the command for the proxy server with all the hosts and ports + PROXY_CMD="python toy_proxy_server.py --port 9192" + + # Add all prefill hosts and ports + PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}" + + # Add all decode hosts and ports + PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 10 + +# curl -X POST -s http://localhost:9192/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. 
Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' + sleep 5 + echo "--------------------===================-------------" +curl -X POST -s http://localhost:9192/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B", + "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", + "max_tokens": 5, + "temperature": 0 + }' + curl -X POST -s http://localhost:9192/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B", + "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. 
I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", + "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], + "max_tokens": 2, + "temperature": 0 + }' + #sleep 10000 + # Run lm eval for this model + echo "Running tests for $model_name" + TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py + + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh new file mode 100644 index 000000000000..ea125f99fc42 --- /dev/null +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ -0,0 +1,159 @@ +#!/bin/bash +set -xe + +# Hosts / ports +PREFILL_HOST=${PREFILL_HOST:-"localhost"} +PREFILL_PORT=${PREFILL_PORT:-8100} +PREFILL_NIXL_SIDE_PORT=${PREFILL_NIXL_SIDE_PORT:-5577} +DECODE_HOST=${DECODE_HOST:-"localhost"} +DECODE_PORT=${DECODE_PORT:-8200} +PROXY_HOST=${PROXY_HOST:-"localhost"} +PROXY_PORT=${PROXY_PORT:-8192} +BASELINE_HOST=${BASELINE_HOST:-"localhost"} +BASELINE_PORT=${BASELINE_PORT:-9290} + + +# Model to run. +MODEL_NAME=${MODEL_NAME:-"meta-llama/Llama-3.2-3B-Instruct"} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-1024} +BLOCK_SIZE=${BLOCK_SIZE:-32} + + +# execution env +GIT_ROOT=$(git rev-parse --show-toplevel) +EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration" +CONDA_PATH=${CONDA_PATH:-"/home/${USER}/anaconda3"} +CONDA_ENV_NAME=${CONDA_ENV_NAME:-"nixl"} + +OUTPUT_FILE=${OUTPUT_FILE:-"${EXP_ROOT}/.tpu_accuracy_test_outputs.txt"} + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + + +# Waits for vLLM server to start. 
+wait_for_server() { + local host=$1 + local port=$2 + timeout 1200 bash -c " + until curl -s ${host}:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 || true + # pkill -f python || true + echo "Cleanup complete. Exiting." +} + +launch_baseline() { + BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; + VLLM_LOGGING_LEVEL=DEBUG \ + VLLM_USE_V1=1 \ + PJRT_DEVICE=TPU \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ + --host ${BASELINE_HOST} \ + --port ${BASELINE_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + --block-size ${BLOCK_SIZE} \ + --gpu-memory-utilization 0.5 \ + --enforce-eager" + echo ${BASELINE_BASE_CMD} + ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" & +} + +launch_pd() { + PREFILL_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; + UCX_TLS=tcp \ + VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ + VLLM_LOGGING_LEVEL=DEBUG \ + VLLM_USE_V1=1 \ + VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ + VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ + PJRT_DEVICE=TPU \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ + --host ${PREFILL_HOST} \ + --port ${PREFILL_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + --block-size ${BLOCK_SIZE} \ + --enforce-eager \ + --gpu-memory-utilization 0.5 \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + + DECODE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; + UCX_TLS=tcp \ + VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ + VLLM_LOGGING_LEVEL=DEBUG \ + VLLM_USE_V1=1 \ + PJRT_DEVICE=TPU \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ + --host ${DECODE_HOST} \ + --port ${DECODE_PORT} \ + --max-model-len ${MAX_MODEL_LEN}\ + --seed 42 \ + --block-size ${BLOCK_SIZE} \ + --enforce-eager \ + --gpu-memory-utilization 0.5 \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + echo ${PREFILL_BASE_CMD} + echo ${DECODE_BASE_CMD} + sleep 2 + + # execute on hosts + ssh -tt ${PREFILL_HOST} "${PREFILL_BASE_CMD}" & + ssh -tt ${DECODE_HOST} "${DECODE_BASE_CMD}" & + sleep 1 + wait_for_server ${PREFILL_HOST} ${PREFILL_PORT} + sleep 1 + wait_for_server ${DECODE_HOST} ${DECODE_PORT} + sleep 1 +} + +launch_pd_proxy(){ + PROXY_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; + python3 ${EXP_ROOT}/toy_proxy_server.py \ + --prefiller-host ${PREFILL_HOST} --prefiller-port ${PREFILL_PORT} \ + --decoder-host ${DECODE_HOST} --decoder-port ${DECODE_PORT} \ + --host=${PROXY_HOST} --port ${PROXY_PORT}" + echo ${PROXY_BASE_CMD} + ssh -tt ${PROXY_HOST} "${PROXY_BASE_CMD}" & +} + +run_tests(){ + local service_url=$1 + local mode=$2 + python3 ${EXP_ROOT}/test_disagg_accuracy.py --service_url=${service_url} --model_name=${MODEL_NAME} --mode=${mode} --file_name=${OUTPUT_FILE} +} + + +# run non-disagg. baseline & save outputs +launch_baseline +sleep 2 +wait_for_server ${BASELINE_HOST} ${BASELINE_PORT} +run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline" +cleanup +sleep 10 + + +# run disagg. 
& do exact-match with the outputs from baseline +launch_pd +launch_pd_proxy +sleep 10 +run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg" +echo "-----P/D success----" + +rm ${OUTPUT_FILE} +cleanup + +exit 0 \ No newline at end of file diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py index e5d66ffeeeb2..f3381a31de64 100644 --- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py +++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py @@ -5,7 +5,7 @@ import lm_eval import openai -BASE_URL = "http://localhost:8192/v1" +BASE_URL = "http://localhost:9192/v1" NUM_CONCURRENT = 100 TASK = "gsm8k" FILTER = "exact_match,strict-match" diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 3d720fe0cafe..6aaa4da38e7b 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -3,7 +3,8 @@ import argparse import itertools -import os +import logging +import os,time,sys import uuid from contextlib import asynccontextmanager @@ -11,9 +12,8 @@ from fastapi import FastAPI, Request from fastapi.responses import StreamingResponse -from vllm.logger import init_logger - -logger = init_logger(__name__) +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) @asynccontextmanager @@ -162,6 +162,8 @@ async def send_request_to_service(client_info: dict, endpoint: str, } req_data["stream"] = False req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 if "stream_options" in req_data: del req_data["stream_options"] headers = { @@ -182,6 +184,7 @@ async def stream_service_response(client_info: dict, endpoint: str, """ Asynchronously stream response from a service using a client from the pool. 
""" + s1 = time.perf_counter() headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", "X-Request-Id": request_id @@ -196,8 +199,8 @@ async def stream_service_response(client_info: dict, endpoint: str, yield chunk -@app.post("/v1/completions") -async def handle_completions(request: Request): +async def _handle_completions(api: str, request: Request): + s1 = time.perf_counter() try: req_data = await request.json() request_id = str(uuid.uuid4()) @@ -206,16 +209,26 @@ async def handle_completions(request: Request): prefill_client_info = get_next_client(request.app, 'prefill') # Send request to prefill service - response = await send_request_to_service(prefill_client_info, - "/completions", req_data, - request_id) - + p = send_request_to_service(prefill_client_info, api, + req_data, request_id) + s2 = time.perf_counter() + #print(f'libin proxy send to prefill {s2-s1}') + sys.stdout.flush() + response = await p + s3 = time.perf_counter() # Extract the needed fields response_json = response.json() kv_transfer_params = response_json.get('kv_transfer_params', {}) if kv_transfer_params: + #remote_block_len = len(kv_transfer_params['remote_block_ids']) + #logger.debug('buke: cut:', type(kv_transfer_params), kv_transfer_params['remote_block_ids'],kv_transfer_params['remote_block_ids'][:(remote_block_len//8)*8]) + + #kv_transfer_params['remote_block_ids'] = kv_transfer_params['remote_block_ids'][:(remote_block_len//8)*8] + #if remote_block_len % 8 == 0: + # kv_transfer_params['remote_block_ids'] = kv_transfer_params['remote_block_ids'][:(remote_block_len//8)*8-1] + # logger.info('buke hit corner case multiples of 8:', remote_block_len) req_data["kv_transfer_params"] = kv_transfer_params - + #print(req_data) # Get the next decode client in round-robin fashion decode_client_info = get_next_client(request.app, 'decode') @@ -223,26 +236,47 @@ async def handle_completions(request: Request): # Stream response from decode service async def generate_stream(): + is_first = False + s6 = time.perf_counter() async for chunk in stream_service_response(decode_client_info, - "/completions", + api, req_data, request_id=request_id): + + if is_first is False: + s4 = time.perf_counter() + #print(f'libin debug proxy receive decode 1 total:{s4-s1}| prefill:{s3-s1}| in-between:{s6-s3}|decode:{s4-s6}| {s6=} {s4=}') + sys.stdout.flush() + is_first = True yield chunk - return StreamingResponse(generate_stream(), + re = StreamingResponse(generate_stream(), media_type="application/json") + s5 = time.perf_counter() + + #sys.stdout.flush() + return re except Exception as e: - import sys import traceback exc_info = sys.exc_info() print("Error occurred in disagg prefill proxy server" - " - completions endpoint") + f" - {api} endpoint") print(e) print("".join(traceback.format_exception(*exc_info))) raise +@app.post("/v1/completions") +async def handle_completions(request: Request): + return await _handle_completions("/completions", request) + + +@app.post("/v1/chat/completions") +async def handle_chat_completions(request: Request): + return await _handle_completions("/chat/completions", request) + + @app.get("/healthcheck") async def healthcheck(): """Simple endpoint to check if the server is running.""" diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 26f467b244dd..85af87027d0a 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -445,7 +445,7 @@ def __init__( raise ValueError( f"Head size {head_size} is not supported by 
PagedAttention. " f"Supported head sizes are: {supported_head_sizes}.") - + self.is_prompt = None self.attn_type = attn_type if (self.attn_type != AttentionType.DECODER and self.attn_type != AttentionType.ENCODER_DECODER @@ -528,6 +528,7 @@ def forward( if attn_metadata.is_prompt: # Prompt run. + self.is_prompt = True query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) @@ -597,6 +598,7 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. + self.is_prompt = False if not self.sliding_window: block_list = attn_metadata.block_list block_groups = attn_metadata.block_groups @@ -645,8 +647,8 @@ def common_attention_args(self, 'batch2block_matmul_op': self.batch2block_matmul, 'block2batch_matmul_op': self.block2batch_matmul, 'fsdpa_op': self.fused_scaled_dot_product_attention, - 'keys_fetch_func': self.k_cache.fetch_from_cache, - 'values_fetch_func': self.v_cache.fetch_from_cache, + 'keys_fetch_func': self.k_cache.fetch_from_cache if (not self.is_prompt or not self.use_contiguous_pa) else self.k_cache.fetch_from_cache_prompt, + 'values_fetch_func': self.v_cache.fetch_from_cache if (not self.is_prompt or not self.use_contiguous_pa) else self.v_cache.fetch_from_cache_prompt, 'softmax_op': self.softmax, 'block_list': block_list, 'key_cache': key_cache, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index f00f31dde915..19eb8325b248 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorRole) + KVConnectorBase_V1, KVConnectorRole, KVTransferParams) -__all__ = ["KVConnectorRole", "KVConnectorBase_V1"] +__all__ = ["KVConnectorRole", "KVConnectorBase_V1", "KVTransferParams"] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index f80b5eba235d..e51ee30eb246 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -32,7 +32,7 @@ import enum from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Callable, Literal, Optional import torch @@ -46,6 +46,12 @@ from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request +# s_tensor_list, d_tensor_list, s_indices, d_indices, direction +CopyBlocksOp = Callable[[ + dict[str, torch.Tensor], dict[ + str, torch.Tensor], list[int], list[int], Literal["h2d", "d2h"] +], None] + logger = init_logger(__name__) @@ -56,8 +62,23 @@ class KVConnectorRole(enum.Enum): # Connector running in the worker process WORKER = 1 +class KVTransferParams: + """ + Abstract KVTransferParams used to send KVTransfer + parameters between instances of vLLM. + + Specific instances of KVConnector customize this + method for serializing / deserializing msgs sent + via the HTTP protocol. 
+ """ + + @staticmethod + def from_raw_dict( + raw_dict: Optional[dict[str, + Any]]) -> Optional["KVTransferParams"]: + return None -class KVConnectorMetadata: +class KVConnectorMetadata(ABC): # noqa: B024 """ Abstract Metadata used to communicate between the Scheduler KVConnector and Worker KVConnector. @@ -66,12 +87,13 @@ class KVConnectorMetadata: class KVConnectorBase_V1(ABC): + _KVTransferParams = KVTransferParams def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): logger.warning( "Initializing KVConnectorBase_V1. This API is experimental and " "subject to change in the future as we iterate the design.") - self._connector_metadata = KVConnectorMetadata() + self._connector_metadata: Optional[KVConnectorMetadata] = None self._vllm_config = vllm_config self._role = role @@ -102,7 +124,7 @@ def clear_connector_metadata(self) -> None: This function should be called by the model runner every time after the model execution. """ - self._connector_metadata = KVConnectorMetadata() + self._connector_metadata = None def _get_connector_metadata(self) -> KVConnectorMetadata: """Get the connector metadata. @@ -112,6 +134,9 @@ def _get_connector_metadata(self) -> KVConnectorMetadata: Returns: ConnectorMetadata: the connector metadata. """ + + # Should only be called while set to valid metadata. + assert self._connector_metadata is not None return self._connector_metadata def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): @@ -124,6 +149,13 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): """ return + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + """ + Set the xPU-specific ops for copying KV between host and device. + Needed when host buffer is used for kv transfer (e.g., in NixlConnector) + """ + return + @abstractmethod def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: @@ -190,7 +222,9 @@ def get_finished( ) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have - finished generating tokens. + finished generating tokens on the worker. + The scheduler process (via the Executors) will use this output + to track which workers are done. Returns: ids of requests that have finished asynchronous transfer @@ -204,6 +238,12 @@ def get_finished( # ============================== # Scheduler-side methods # ============================== + def set_kv_transfer_params(self, request: "Request"): + """Parse raw KV Transfer params.""" + assert request.kv_transfer_params is None + kv_transfer_params = self._KVTransferParams.from_raw_dict( + request.raw_kv_transfer_params) + request.kv_transfer_params = kv_transfer_params @abstractmethod def get_num_new_matched_tokens( @@ -281,3 +321,17 @@ def request_finished( returned by the engine. """ return False, None + + @classmethod + def get_required_kvcache_layout( + cls, vllm_config: "VllmConfig") -> Optional[str]: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. 
+ """ + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py old mode 100644 new mode 100755 index 7552fc889f2f..54e3e667886e --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import logging import math +import queue import threading -import time +import time,os import uuid from collections import defaultdict from collections.abc import Iterator +from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional @@ -18,23 +21,27 @@ from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) + CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) +from vllm.distributed.utils import divide +from vllm.forward_context import ForwardContext from vllm.logger import init_logger -from vllm.platforms import _Backend +from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket, round_down from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata - from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request Transfer = tuple[int, float] # (xfer_handle, start_time) +EngineId = str +ReqId = str + GET_META_MSG = b"get_meta_msg" logger = init_logger(__name__) @@ -42,11 +49,23 @@ # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used try: from nixl._api import nixl_agent as NixlWrapper + import habana_frameworks.torch.utils as htutils + logger.info("htutils is available") logger.info("NIXL is available") except ImportError: + logger.warning("htutils is not available") + htutils = None logger.warning("NIXL is not available") NixlWrapper = None +# Supported xPUs and types of kv transfer buffer. +# {xPU: tuple of supported kv buffer types} +_NIXL_SUPPORTED_XPUS = { + "cuda": ("cuda", ), + "tpu": ("cpu", ), + "hpu": ("cpu", "hpu") +} + class NixlAgentMetadata( msgspec.Struct, @@ -57,7 +76,6 @@ class NixlAgentMetadata( agent_metadata: bytes kv_caches_base_addr: list[int] num_blocks: int - tp_size: int block_len: int attn_backend_name: str @@ -69,42 +87,84 @@ class ReqMeta: remote_host: str remote_port: int remote_engine_id: str + tp_size: int + # Whether this request had a full/partial in-memory (local) hit so + # that only the remainining blocks are required to read. + # This is a wicked fix for heterogeneous devices test between + # Nvidia device and Habana device since the block ids are not aligned. + # We should ideally set kv_transfer_params["is_mem_hit"] to True + # by scheduler/worker logic once a memory hit condition is detected. 
+ # TODO: remove this field once vllm-fork rebases vllm upstream repo + is_mem_hit: bool = False class NixlConnectorMetadata(KVConnectorMetadata): def __init__(self): - self.requests: dict[str, ReqMeta] = {} + self.reqs_to_recv: dict[ReqId, ReqMeta] = {} + self.reqs_to_save: dict[ReqId, ReqMeta] = {} + self.reqs_to_send: dict[ReqId, float] = {} def add_new_req( self, - request_id: str, + request_id: ReqId, local_block_ids: list[int], kv_transfer_params: dict[str, Any], + load_remote_cache: bool = True, + save_to_host: bool = False, ): - self.requests[request_id] = ReqMeta( + # save and load are mutually exclusive + assert load_remote_cache ^ save_to_host + _req = ReqMeta( local_block_ids=local_block_ids, remote_block_ids=kv_transfer_params["remote_block_ids"], remote_engine_id=kv_transfer_params["remote_engine_id"], remote_host=kv_transfer_params["remote_host"], remote_port=kv_transfer_params["remote_port"], + # P workers don't need to receive tp_size from proxy here. + tp_size=kv_transfer_params.get("tp_size", 1), + is_mem_hit=kv_transfer_params.get("is_mem_hit", False), ) + if save_to_host: + self.reqs_to_save[request_id] = _req + if load_remote_cache: + self.reqs_to_recv[request_id] = _req class NixlConnector(KVConnectorBase_V1): def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): assert vllm_config.kv_transfer_config is not None - self.engine_id = vllm_config.kv_transfer_config.engine_id + assert vllm_config.kv_transfer_config.engine_id is not None + self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id if role == KVConnectorRole.SCHEDULER: - self.connector_scheduler : Optional[NixlConnectorScheduler] = \ - NixlConnectorScheduler(vllm_config, str(self.engine_id)) + self.connector_scheduler: Optional[NixlConnectorScheduler] = \ + NixlConnectorScheduler(vllm_config, self.engine_id) self.connector_worker: Optional[NixlConnectorWorker] = None elif role == KVConnectorRole.WORKER: self.connector_scheduler = None self.connector_worker = NixlConnectorWorker( - vllm_config, str(self.engine_id)) + vllm_config, self.engine_id) + + ############################################################ + # Class Methods + ############################################################ + @classmethod + def get_required_kvcache_layout(cls, vllm_config: VllmConfig): + if vllm_config.model_config is None: + logger.warning_once("Unable to detect current VLLM config. " + "Fallback to default kv cache layout.") + return None + use_mla = vllm_config.model_config.use_mla + if use_mla: + # return None when we have mla + # as the layout should not matter in that case, + # which fallback to the default behavior. 
+ return None + logger.info_once("NixlConnector setting KV cache " + "layout to HND for better xfer performance.") + return "HND" ############################################################ # Scheduler Side Methods @@ -146,17 +206,27 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + assert self.connector_worker is not None + self.connector_worker.set_host_xfer_buffer_ops(copy_operation) + def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]: """Get the finished recving and sending requests.""" + s1 = time.perf_counter() assert self.connector_worker is not None - return self.connector_worker.get_finished() + re= self.connector_worker.get_finished() + #logger.info(f'libin debug get_finished {os.getenv('RANK')}, takes {time.perf_counter() - s1}') + return re def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: + s1 = time.perf_counter() assert self.connector_worker is not None assert isinstance(self._connector_metadata, NixlConnectorMetadata) self.connector_worker.start_load_kv(self._connector_metadata) + #logger.info(f"libin debug start_load_kv return {os.getenv('RANK')}, takes {time.perf_counter() - s1}") + #logger.info(f"libin debug start_load_kv return {os.getenv('RANK')}, takes {time.perf_counter() - s1}") def wait_for_layer_load(self, layer_name: str) -> None: """NixlConnector does not do layerwise saving.""" @@ -168,9 +238,14 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, pass def wait_for_save(self): - """NixlConnector does not save explicitly.""" - pass - + s1 = time.perf_counter() + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, NixlConnectorMetadata) + self.connector_worker.rewrite_kv_based_on_transfer_layout(self._connector_metadata) + if self.connector_worker.use_host_buffer and \ + self.connector_worker.copy_blocks: + self.connector_worker.save_kv_to_host(self._connector_metadata) + # logger.info(f"libin debug wait_for_save {os.getenv('RANK')}, takes {time.perf_counter() - s1}") class NixlConnectorScheduler: """Implementation of Scheduler side methods""" @@ -178,18 +253,25 @@ class NixlConnectorScheduler: def __init__(self, vllm_config: VllmConfig, engine_id: str): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size - self.engine_id = engine_id + self.engine_id: EngineId = engine_id self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST self.side_channel_port = ( envs.VLLM_NIXL_SIDE_CHANNEL_PORT + - vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.data_parallel_rank * vllm_config.parallel_config.tensor_parallel_size) + self.use_host_buffer = \ + vllm_config.kv_transfer_config.kv_buffer_device == "cpu" logger.info("Initializing NIXL Scheduler %s", engine_id) + self.hetero_blk_id_wa = os.getenv('PT_HPU_HETERO_BLOCK_ID_WA', '1') == '1' - # Requests that need to start recv. + # Requests that need to start recv/send. # New requests are added by update_state_after_alloc in # the scheduler. Used to make metadata passed to Worker. 
- self._reqs_need_recv: dict[str, tuple[Request, list[int]]] = {} + self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {} + self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {} + # Reqs to send and their expiration time + self._reqs_need_send: dict[ReqId, float] = {} + def get_num_new_matched_tokens( self, request: "Request", @@ -197,30 +279,31 @@ def get_num_new_matched_tokens( """ For remote prefill, pull all prompt blocks from remote asynchronously relative to engine execution. - + Args: request (Request): the request object. num_computed_tokens (int): the number of locally computed tokens for this request Returns: - * the number of tokens that can be loaded from the + * the number of tokens that can be loaded from the external KV cache beyond what is already computed. * true if the external KV cache tokens will be loaded asynchronously (between scheduler steps). """ params = request.kv_transfer_params - logger.debug( + logger.info( "NIXLConnector get_num_new_matched_tokens: " "num_computed_tokens=%s, kv_transfer_params=%s", num_computed_tokens, params) - + logger.debug(f'buke get_num_new_matched_tokens: {vars(request)=}') if params is not None and params.get("do_remote_prefill"): # Remote prefill: get all prompt blocks from remote. assert num_computed_tokens % self.block_size == 0 rounded_num_prompt_tokens = round_down( len(request.prompt_token_ids), self.block_size) count = max(rounded_num_prompt_tokens - num_computed_tokens, 0) + if count > 0: return count, True @@ -236,19 +319,53 @@ def update_state_after_alloc(self, request: "Request", "NIXLConnector update_state_after_alloc: " "num_external_tokens=%s, kv_transfer_params=%s", num_external_tokens, params) - - if params is not None and params.get("do_remote_prefill"): + logger.debug(f'buke update_state_after_alloc: {vars(request)=}') + if not params: + return + if self.use_host_buffer and params.get("do_remote_decode"): + # NOTE: when accelerator is not directly supported by Nixl, + # prefilled blocks need to be saved to host memory before transfer. + + # figure out full computed blocks to save + block_ids = blocks.get_block_ids()[0] + all_full = request.num_tokens % self.block_size == 0 + full_block_ids = (block_ids if all_full else block_ids[:-1]) + # TODO: skip the blocks that are already in the host xfer buffer. + # Currently, the host xfer buffer block is 1-to-1 mapped to device + # kv blocks, so host blocks won't be flushed as long as its device + # block is not overwritten; and it will be safe to skip saving them + # to host xfer buffer. + if full_block_ids: + self._reqs_need_save[request.request_id] = \ + (request, full_block_ids) + elif params.get("do_remote_prefill"): if params.get("remote_block_ids"): if all(p in params for p in ("remote_engine_id", "remote_host", "remote_port")): - # If remote_blocks and num_external_tokens = 0, we have - # a full prefix cache hit on the D worker. We need to call - # send_notif in _read_blocks to free the memory on the P. - local_block_ids = (blocks.get_unhashed_block_ids() - if num_external_tokens > 0 else []) - # Get unhashed blocks to pull from remote. - self._reqs_need_recv[request.request_id] = ( - request, local_block_ids) + if self.hetero_blk_id_wa: + block_ids = blocks.get_block_ids()[0] + local_block_ids = blocks.get_unhashed_block_ids() + if num_external_tokens > 0: + # Get unhashed blocks to pull from remote. 
+ self._reqs_need_recv[request.request_id] = ( + request, local_block_ids) + if len(block_ids) > len(local_block_ids): + params["is_mem_hit"] = True + logger.debug(f"jwang {request.request_id=} {block_ids=} {local_block_ids=} need _reqs_need_recv ") + else: + #self._reqs_need_recv[request.request_id] = (request, []) + assert len(block_ids) >= len(local_block_ids), \ + f"jwang oops, it really happens {request.request_id=} {block_ids=} {local_block_ids=}" + else: + # If remote_blocks and num_external_tokens = 0, we have + # a full prefix cache hit on the D worker. We need to call + # send_notif in _read_blocks to free the memory on the P. + local_block_ids = (blocks.get_unhashed_block_ids() + if num_external_tokens > 0 else []) + # Get unhashed blocks to pull from remote. + self._reqs_need_recv[request.request_id] = ( + request, local_block_ids) + else: logger.warning( "Got invalid KVTransferParams: %s. This " @@ -273,8 +390,22 @@ def build_connector_meta( kv_transfer_params=req.kv_transfer_params, ) + for req_id, (req, block_ids) in self._reqs_need_save.items(): + assert req.kv_transfer_params is not None + meta.add_new_req( + request_id=req_id, + local_block_ids=block_ids, + kv_transfer_params=req.kv_transfer_params, + load_remote_cache=False, + save_to_host=True, + ) + + meta.reqs_to_send = self._reqs_need_send + # Clear the list once workers start the transfers self._reqs_need_recv.clear() + self._reqs_need_save.clear() + self._reqs_need_send = {} return meta @@ -292,8 +423,21 @@ def request_finished( logger.debug( "NIXLConnector request_finished, request_status=%s, " "kv_transfer_params=%s", request.status, params) + if not params: + return False, None + + if params.get("do_remote_prefill"): + # If do_remote_prefill is still True when the request is finished, + # update_state_after_alloc must not have been called (the request + # must have been aborted before it was scheduled). + # To avoid stranding the prefill blocks in the prefill instance, + # we must add empty block_ids to _reqs_need_recv so that our + # worker side will notify and free blocks in the prefill instance. + self._reqs_need_recv[request.request_id] = (request, []) + params["do_remote_prefill"] = False + return False, None - if (params is None or not params.get("do_remote_decode") + if (not params.get("do_remote_decode") or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): return False, None @@ -304,6 +448,11 @@ def request_finished( # If prompt < block_size, no xfer so free blocks immediately. delay_free_blocks = len(computed_block_ids) > 0 + if delay_free_blocks: + # Prefill request on remote. It will be read from D upon completion + self._reqs_need_send[request.request_id] = time.perf_counter( + ) + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT + return delay_free_blocks, dict( do_remote_prefill=True, do_remote_decode=False, @@ -311,7 +460,7 @@ def request_finished( remote_engine_id=self.engine_id, remote_host=self.side_channel_host, remote_port=self.side_channel_port, - ) + tp_size=self.vllm_config.parallel_config.tensor_parallel_size) class NixlConnectorWorker: @@ -323,37 +472,72 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): raise RuntimeError("NIXL is not available") logger.info("Initializing NIXL wrapper") logger.info("Initializing NIXL worker %s", engine_id) + self.decoder_tp_ratio = int(os.getenv('DECODER_TP_RATIO', 1)) # Config. 
self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size - + # block_factor = G2.block_size/remote_hw.block_size + self.block_factor = int(os.getenv('PT_HPU_BLOCK_SIZE_FACTOR', '1')) + self.block_shape = None + self.is_hetero = os.getenv('PT_HPU_ENABLE_RESTORE_KV_LAYOUT', '0') == '1' # Agent. self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None) # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}. - self._remote_agents: dict[str, dict[int, str]] = defaultdict(dict) + self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict) # NIXL handshake port. # NOTE(rob): Within a DP group, each DP rank gets its own # base port (which is sent in the KVTransferParams). # Each TP rank listens/queries on the base_port + tp_rank. - self.side_channel_port = ( + self.side_channel_port: int = ( envs.VLLM_NIXL_SIDE_CHANNEL_PORT + - vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.data_parallel_rank * vllm_config.parallel_config.tensor_parallel_size) # Metadata. - self.engine_id = engine_id + self.engine_id: EngineId = engine_id self.tp_rank = get_tensor_model_parallel_rank() self.world_size = get_tensor_model_parallel_world_size() self.tp_group = get_tp_group() + self.num_blocks = 0 # KV Caches and nixl tracking data. - self.kv_caches: dict[str, torch.Tensor] = {} + self.device_type = current_platform.device_type + self.kv_buffer_device: str = \ + vllm_config.kv_transfer_config.kv_buffer_device + if self.device_type not in _NIXL_SUPPORTED_XPUS: + raise RuntimeError(f"{self.device_type} is not supported.") + elif self.kv_buffer_device not in _NIXL_SUPPORTED_XPUS[ + self.device_type]: + raise RuntimeError( + f"kvconf{vllm_config.kv_transfer_config} {self.device_type} with {self.kv_buffer_device} kv_buffer " + "is not supported.") + self.device_kv_caches: dict[str, torch.Tensor] = {} + + # cpu kv buffer for xfer + # used when xPU memory can not be registered under nixl + self.host_xfer_buffers: dict[str, torch.Tensor] = {} + self.use_host_buffer = self.kv_buffer_device == "cpu" + if self.kv_buffer_device == "cuda" or self.kv_buffer_device == "hpu": + self.nixl_memory_type = "VRAM" + elif self.kv_buffer_device == "cpu": + self.nixl_memory_type = "DRAM" + else: + raise RuntimeError( + f"{self.device_type} with {self.kv_buffer_device} kv_buffer " + "is not supported.") + if self.kv_buffer_device == "cpu" and self.is_hetero: + self.remote_nixl_memory_type = "VRAM" + else: + self.remote_nixl_memory_type = self.nixl_memory_type + + # Note: host xfer buffer ops when use_host_buffer is True + self.copy_blocks: Optional[CopyBlocksOp] = None # Map of engine_id -> kv_caches_base_addr. For TP case, each local # rank will still only pull from a single remote TP worker. - self.kv_caches_base_addr: dict[str, list[int]] = {} + self.kv_caches_base_addr: dict[EngineId, list[int]] = {} # Number of NIXL regions. Currently one region per cache # (so 1 per layer for MLA, otherwise 2 per layer) @@ -363,27 +547,31 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): # nixl_prepped_dlist_handle. self.src_xfer_side_handle: int = 0 # Map of engine_id -> nixl_prepped_dlist_handle (int)]. - self.dst_xfer_side_handles: dict[str, int] = {} + self.dst_xfer_side_handles: dict[EngineId, int] = {} # Map of engine_id -> num_blocks. All ranks in the same deployment will # have the same number of blocks. 
- self.dst_num_blocks: dict[str, int] = {} + self.dst_num_blocks: dict[EngineId, int] = {} self._registered_descs: list[Any] = [] # In progress transfers. # [req_id -> list[handle]] - self._recving_transfers = defaultdict[str, list[Transfer]](list) + self._recving_metadata: dict[ReqId, ReqMeta] = {} + self._recving_transfers = defaultdict[ReqId, list[Transfer]](list) + # Track the expiration time of requests that are waiting to be sent. + self._reqs_to_send: dict[ReqId, float] = {} - # Complete transfer tracker. Used by the rank 0 to track finished - # transactions on ranks 1 to N-1. - # [req_id -> count] - self._done_recving_count: defaultdict[str, - int] = defaultdict(lambda: 0) - self._done_sending_count: defaultdict[str, - int] = defaultdict(lambda: 0) - - # Background thread for establishing new connections. + # Background thread for handling new handshake requests. self._nixl_handshake_listener_t: Optional[threading.Thread] = None + # Background thread for initializing new NIXL handshakes. + self._handshake_initiation_executor = ThreadPoolExecutor( + # NIXL is not guaranteed to be thread-safe, limit 1 worker. + max_workers=1, + thread_name_prefix="vllm-nixl-handshake-initiator") + self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]() + self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {} + # Protects _handshake_futures and _remote_agents. + self._handshake_lock = threading.RLock() self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size @@ -405,12 +593,20 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.backend_name = backend.get_name() attn_backend = backend_name_to_enum(self.backend_name) self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1 + self._use_pallas_v1 = attn_backend == _Backend.PALLAS_VLLM_V1 logger.debug("Detected attention backend %s", self.backend_name) - self._tp_size: dict[str, int] = {self.engine_id: self.world_size} + self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} # With heterogeneous TP, P must wait for all assigned D TP workers to # finish reading before safely freeing the blocks. - self.consumer_notification_counts_by_req = defaultdict[str, int](int) + self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) + self.req_send_time={} + self.req_recv_time={} + def __del__(self): + """Cleanup background threads on destruction.""" + self._handshake_initiation_executor.shutdown(wait=False) + if self._nixl_handshake_listener_t: + self._nixl_handshake_listener_t.join(timeout=0) @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -439,7 +635,13 @@ def _nixl_handshake_listener(metadata: NixlAgentMetadata, "Connection listener got unexpected message %s", msg) sock.send_multipart((identity, b"", encoded_data)) - def _nixl_handshake(self, host: str, port: int): + def _nixl_handshake( + self, + host: str, + port: int, + remote_tp_size: int, + expected_engine_id: str, + ) -> dict[int, str]: """Do a NIXL handshake with a remote instance.""" start_time = time.perf_counter() @@ -448,111 +650,226 @@ def _nixl_handshake(self, host: str, port: int): # a hack to keep us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. - def handshake(path: str, rank: int) -> NixlAgentMetadata: - # Send query for the request. 
- with zmq_ctx(zmq.REQ, path) as sock: - sock.send(GET_META_MSG) - metadata_bytes = sock.recv() - decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) - metadata = decoder.decode(metadata_bytes) - got_metadata_time = time.perf_counter() - - # Register Remote agent. - self.add_remote_agent(metadata, rank) - setup_agent_time = time.perf_counter() - - logger.debug("NIXL handshake: get metadata took: %s", - got_metadata_time - start_time) - logger.debug("NIXL handshake: add agent took: %s", - setup_agent_time - got_metadata_time) - return metadata - - # Handshake with remote agent-rank0 first to get the tp_size of remote - path = make_zmq_path("tcp", host, port) - logger.debug("Querying master rank metadata on path: %s", path) - metadata = handshake(path, 0) - - # Handshake only with the other TP remote the current local rank will + # Handshake only with the remote TP rank that current local rank will # pull from. With homogeneous TP it happens to be the same rank_i. - tp_ratio = self._tp_size[self.engine_id] // metadata.tp_size + tp_ratio = self._tp_size[self.engine_id] // remote_tp_size p_remote_rank = self.tp_rank // tp_ratio - if p_remote_rank > 0: - path = make_zmq_path("tcp", host, port + p_remote_rank) - logger.debug("Querying metadata on path: %s at remote rank %s", - path, p_remote_rank) - _ = handshake(path, p_remote_rank) + path = make_zmq_path("tcp", host, port + p_remote_rank) + logger.debug("Querying metadata on path: %s at remote rank %s", path, + p_remote_rank) + # Send query for the request. + with zmq_ctx(zmq.REQ, path) as sock: + sock.send(GET_META_MSG) + metadata_bytes = sock.recv() + decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) + metadata = decoder.decode(metadata_bytes) + got_metadata_time = time.perf_counter() + logger.debug("NIXL handshake: get metadata took: %s", + got_metadata_time - start_time) + + # Ensure engine id matches. + if metadata.engine_id != expected_engine_id: + raise RuntimeError(f"Remote NIXL agent engine ID mismatch. " + f"Expected {expected_engine_id}," + f"received {metadata.engine_id}.") + + # Register Remote agent. + remote_agent_name = self.add_remote_agent(metadata, p_remote_rank, + remote_tp_size) + setup_agent_time = time.perf_counter() + logger.debug("NIXL handshake: add agent took: %s", + setup_agent_time - got_metadata_time) + + # Remote rank -> agent name. 
+ return {p_remote_rank: remote_agent_name} + + def initialize_host_xfer_buffer( + self, kv_caches: dict[str, torch.Tensor]) -> None: + """ + Initialize transfer buffer in CPU mem for accelerators + NOT directly supported by NIXL (e.g., tpu) + """ + xfer_buffers: dict[str, torch.Tensor] = {} + try: + for layer_name, kv_cache in kv_caches.items(): + if self.device_type == "hpu": + kv_shape = (2, *kv_cache[0].shape) + kv_dtype = kv_cache[0].dtype + xfer_buffers[layer_name] = torch.empty(kv_shape, + dtype=kv_dtype, + device="cpu") + else: + kv_shape = kv_cache.shape + kv_dtype = kv_cache.dtype + xfer_buffers[layer_name] = torch.empty(kv_shape, + dtype=kv_dtype, + device="cpu") + except MemoryError as e: + logger.error("NIXLConnectorWorker gets %s.", e) + raise + + self.host_xfer_buffers = xfer_buffers + + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + """Assign copy (d2h, h2d) operations when host buffer is used.""" + assert self.use_host_buffer + self.copy_blocks = copy_operation + nixl1 = None + def _background_nixl_handshake(self, req_id: str, + remote_engine_id: EngineId, meta: ReqMeta): + # Do NIXL handshake in background and add to _ready_requests when done. + global nixl1 + nixl1 = time.perf_counter() + fut = self._handshake_futures.get(remote_engine_id) + if fut is None: + fut = self._handshake_initiation_executor.submit( + self._nixl_handshake, meta.remote_host, meta.remote_port, + meta.tp_size, remote_engine_id) + self._handshake_futures[remote_engine_id] = fut + + def done_callback(f: Future[dict[int, str]], eid=remote_engine_id): + with self._handshake_lock: + nixl2 = time.perf_counter() + global nixl1 + # logger.info(f"libin debug done_callback {os.getenv('RANK')}, HANDSHAKE takes:{nixl2-nixl1}") + del self._handshake_futures[eid] + try: + self._remote_agents[eid] = f.result() + except Exception: + logger.exception("Handshake with %s failed", eid) + + fut.add_done_callback(done_callback) + + # TODO: handle failure state of future in the + # callback, we want to fail the request in this case. + def request_ready(_f: Future[Any], entry=(req_id, meta)): + self._ready_requests.put(entry) + + fut.add_done_callback(request_ready) def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): """Register the KV Cache data in nixl.""" - _, first_kv_cache = next(iter(kv_caches.items())) - kv_elem_size = first_kv_cache.element_size() + if self.device_type == "hpu": + kv_elem_size = first_kv_cache[0][0].dtype.itemsize + else: + kv_elem_size = first_kv_cache.element_size() + + if self.use_host_buffer: + self.initialize_host_xfer_buffer(kv_caches=kv_caches) + assert len(self.host_xfer_buffers) == len(kv_caches), ( + f"host_buffer: {len(self.host_xfer_buffers)}, " + f"kv_caches: {len(kv_caches)}") + xfer_buffers = self.host_xfer_buffers + else: + xfer_buffers = kv_caches + assert not self.host_xfer_buffers, ( + "host_xfer_buffer should not be initialized when " + f"kv_buffer_device is {self.kv_buffer_device}") # TODO(tms): Find a more robust way to detect and handle MLA # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected # KV memory layout is HND, as opposed to the default NHD. Note that it # will only affects the strides. For MLA instead, we make require no # such thing and resort to the standard layout. - use_mla = len(first_kv_cache.shape) == 3 - assert use_mla == self.use_mla - - # TODO (NickLucche) not compatible with hybrid allocator. Enforce check - # once it goes live, as a single kv layout is expected for xfers. - if use_mla: - # MLA case. 
+ use_mla = len(first_kv_cache.shape) == 3 if self.device_type != "hpu" else False + if self.device_type == "tpu": + assert not use_mla, f"{self.kv_buffer_device} does not support MLA." + assert self._use_pallas_v1, f"attn backend: {self.backend_name}" + # tpu (v1) kv shape per layer: + # (num_blocks, block_size, num_kv_heads * 2, head_size) self.num_blocks = first_kv_cache.shape[0] - block_rank = 2 # [block_size, latent_dim] + block_rank = 3 # [block_size, kv_heads, head_dim] block_shape = first_kv_cache.shape[-block_rank:] - block_size, kv_latent_dim = block_shape - self.slot_size_bytes = kv_elem_size * kv_latent_dim - else: - # [2 (k and v), num_blocks, ...] - if self._use_flashinfer: - # FlashInfer swaps 2<->num_blocks dimensions. + block_size, n_kv_heads_x_2, head_dim = block_shape + self.slot_size_bytes = kv_elem_size * n_kv_heads_x_2 * head_dim + elif self.device_type == "cuda": + assert use_mla == self.use_mla + # TODO (NickLucche) not compatible with hybrid allocator. + # Enforce check once it goes live, as a single kv layout + # is expected for xfers. + if use_mla: + # MLA case. self.num_blocks = first_kv_cache.shape[0] - block_rank = 4 # [2, block_size, kv_heads, head_dim] + block_rank = 2 # [block_size, latent_dim] + block_shape = first_kv_cache.shape[-block_rank:] + block_size, kv_latent_dim = block_shape + self.slot_size_bytes = kv_elem_size * kv_latent_dim else: - self.num_blocks = first_kv_cache.shape[1] - block_rank = 3 # [block_size, kv_heads, head_dim] - block_shape = first_kv_cache.shape[-block_rank:] + # [2 (k and v), num_blocks, ...] + if self._use_flashinfer: + # FlashInfer swaps 2<->num_blocks dimensions. + self.num_blocks = first_kv_cache.shape[0] + block_rank = 4 # [2, block_size, kv_heads, head_dim] + else: + self.num_blocks = first_kv_cache.shape[1] + block_rank = 3 # [block_size, kv_heads, head_dim] + block_shape = first_kv_cache.shape[-block_rank:] + block_size, n_kv_heads, head_dim = block_shape[-3:] + + # head size in bytes. + self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim + assert block_size == self.block_size + elif self.device_type == "hpu": + # habana kv_cache: [2, num_blocks*block_size, kv_heads, head_dim] + #from remote_pdb import RemotePdb; RemotePdb('0.0.0.0', 4444).set_trace() + self.num_blocks = first_kv_cache[0].shape[0] // self.block_size + block_rank = 3 # [block_size, kv_heads, head_dim] + block_shape = first_kv_cache[0].shape[-block_rank:] + block_shape = list(block_shape) + block_shape[0] = block_shape[0] // self.num_blocks + block_shape = torch.Size(block_shape) block_size, n_kv_heads, head_dim = block_shape[-3:] + self.block_shape = [block_size, n_kv_heads, head_dim] # head size in bytes. self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim - assert block_size == self.block_size + else: + raise RuntimeError( + f"{self.device_type} ({self.backend_name}) is not supported.") + # TODO(tms): self.block_len needs to be per-layer for sliding window, # hybrid attn, etc # block size in bytes self.block_len = kv_elem_size * math.prod(block_shape) logger.info( - "Registering KV_Caches: use_mla: %s, num_blocks: %s, " - "block_shape: %s, per_layer_kv_cache_shape: %s", use_mla, - self.num_blocks, block_shape, first_kv_cache.shape) - self.dst_num_blocks[self.engine_id] = self.num_blocks - self.kv_caches = kv_caches + "Registering KV_Caches. 
use_mla: %s, kv_buffer_device: %s, " + "use_host_buffer: %s, num_blocks: %s, block_shape: %s, " + "per_layer_kv_cache_shape: %s", use_mla, self.kv_buffer_device, + self.use_host_buffer, self.num_blocks, block_shape, + first_kv_cache[0].shape) + self.dst_num_blocks[self.engine_id] = self.num_blocks * self.block_factor + self.device_kv_caches = kv_caches kv_caches_base_addr = [] caches_data = [] # Note(tms): I modified this from the original region setup code. - # K and V are now in different regions. Advantage is that we can + # K and V are now in different regions. Advantage is that we cans # elegantly support MLA and any cases where the K and V tensors # are non-contiguous (it's not locally guaranteed that they will be) # Disadvantage is that the encoded NixlAgentMetadata is now larger # (roughly 8KB vs 5KB). # Conversely for FlashInfer, K and V are transferred in the same tensor # to better exploit the memory layout (ie num_blocks is the first dim). - for cache_or_caches in kv_caches.values(): + + for cache_or_caches in xfer_buffers.values(): # Normalize to always be a list of caches - cache_list = [cache_or_caches] if use_mla or self._use_flashinfer \ - else cache_or_caches + cache_list = [cache_or_caches] if use_mla \ + or self._use_pallas_v1 or self._use_flashinfer \ + else cache_or_caches for cache in cache_list: - base_addr = cache.data_ptr() + if self.device_type == "hpu" and not self.use_host_buffer and htutils is not None: + base_addr = htutils.experimental._data_ptr(cache) + logger.debug(f'buke register gaudi memory for gdr: {base_addr=}|{hex(base_addr)=}|{cache.data_ptr()=}') + else: + base_addr = cache.data_ptr() region_len = self.num_blocks * self.block_len - caches_data.append( - (base_addr, region_len, cache.device.index, "")) + # NOTE: use tp_rank for device_id since multi-node TP + # is rarely used. + caches_data.append((base_addr, region_len, self.tp_rank, "")) kv_caches_base_addr.append(base_addr) self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr self.num_regions = len(caches_data) - self.num_layers = len(self.kv_caches.keys()) + self.num_layers = len(xfer_buffers.keys()) # TODO(mgoin): remove this once we have hybrid memory allocator # Optimization for models with local attention (Llama 4) @@ -574,7 +891,8 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.block_window_per_layer) assert len(self.block_window_per_layer) == self.num_layers - descs = self.nixl_wrapper.get_reg_descs(caches_data, "VRAM") + descs = self.nixl_wrapper.get_reg_descs(caches_data, + self.nixl_memory_type) logger.debug("Registering descs: %s", caches_data) self.nixl_wrapper.register_memory(descs) logger.debug("Done registering descs") @@ -588,15 +906,17 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): # could create fewer, but then _get_block_descs_ids needs to # select agent_meta.num_blocks instead of self.num_blocks for # local descr, and that makes handling regular flow less clean. - for block_id in range(self.num_blocks): - block_offset = block_id * self.block_len + for block_id in range(self.num_blocks * self.block_factor): + block_offset = block_id * self.block_len // (self.block_factor) addr = base_addr + block_offset # (addr, len, device id) - blocks_data.append((addr, self.block_len, self.tp_rank)) + # TODO: does device_id matter to DRAM? 
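To make the size bookkeeping above concrete, here is a small worked sketch of how slot_size_bytes, block_len, and the per-sub-block length relate for an HND-layout cache. The shapes and the block_factor value are made up for illustration, not taken from the patch:

import math
import torch

# Hypothetical per-layer K (or V) region: [num_blocks, block_size, n_kv_heads, head_dim]
num_blocks, block_size, n_kv_heads, head_dim = 128, 16, 8, 128
kv_elem_size = torch.bfloat16.itemsize            # 2 bytes per element

block_shape = (block_size, n_kv_heads, head_dim)
slot_size_bytes = kv_elem_size * n_kv_heads * head_dim    # one token slot
block_len = kv_elem_size * math.prod(block_shape)         # one full block
region_len = num_blocks * block_len                       # one K or V region

# With a prefill/decode block-size mismatch, each local block is advertised
# to NIXL as `block_factor` smaller sub-blocks.
block_factor = 2
sub_block_len = block_len // block_factor
assert slot_size_bytes * block_size == block_len
assert sub_block_len * block_factor == block_len
print(slot_size_bytes, block_len, sub_block_len, region_len)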
+ blocks_data.append((addr, self.block_len//(self.block_factor), self.tp_rank)) logger.debug("Created %s blocks for src engine %s and rank %s", len(blocks_data), self.engine_id, self.tp_rank) - - descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") + #print(f'buke: {blocks_data[0:10]=}') + descs = self.nixl_wrapper.get_xfer_descs(blocks_data, + self.nixl_memory_type) # NIXL_INIT_AGENT to be used for preparations of local descs. self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist( "NIXL_INIT_AGENT", descs) @@ -607,7 +927,6 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): agent_metadata=self.nixl_wrapper.get_agent_metadata(), kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], num_blocks=self.num_blocks, - tp_size=self.world_size, block_len=self.block_len, attn_backend_name=self.backend_name) ready_event = threading.Event() @@ -617,24 +936,25 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): daemon=True, name="nixl_handshake_listener") self._nixl_handshake_listener_t.start() - ready_event.wait() + ready_event.wait() # Wait for listener ZMQ socket to be ready. def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata, - remote_tp_rank: int = 0): + remote_tp_rank: int = 0, + remote_tp_size: int = 1) -> str: """ Add the remote NIXL agent and prepare the descriptors for reading cache blocks from remote. In particular, handle both homogeneous and heterogeneous TP. The former - requires local rank_i to read from remote rank_i. - The latter, assuming D.world_size > P.world_size, requires that two or + requires local rank_i to read from remote rank_i. + The latter, assuming D.world_size > P.world_size, requires that two or more local TP worker share the xfer from a single TP worker. Here's an example: rank_offset p_remote_tp_rank - (kv split no) + (kv split no) -------------------------------- 0 0 Worker0 ---- 1st half of KV ----> Worker0 [ KV Cache ] / @@ -647,14 +967,14 @@ def add_remote_agent(self, Decoder TP workers Prefix TP workers (world_size=4) (world_size=2) - tp_ratio = 4 // 2 = 2 - - Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim] + tp_ratio = 4 // 2 = 2 + + Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim] then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format. - Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio + Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split - along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0. - + along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0. + Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1. 
Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0 @@ -662,28 +982,33 @@ def add_remote_agent(self, """ # noqa: E501 engine_id = nixl_agent_meta.engine_id # TODO re-evaluate refreshing for scaling/recovery - if remote_tp_rank in self._remote_agents.get(engine_id, ()): - return + if remote_tp_rank in self._remote_agents.get(engine_id, {}): + return self._remote_agents[engine_id][remote_tp_rank] - if engine_id in self._tp_size: - assert self._tp_size[engine_id] == nixl_agent_meta.tp_size + if engine_id not in self._tp_size: + self._tp_size[engine_id] = remote_tp_size else: - self._tp_size[engine_id] = nixl_agent_meta.tp_size + assert self._tp_size[engine_id] == remote_tp_size # We may eventually enable this after asserting equality in cache # layout and close outputs. - assert nixl_agent_meta.attn_backend_name == self.backend_name + assert nixl_agent_meta.attn_backend_name == "FLASH_ATTN_VLLM_V1" or nixl_agent_meta.attn_backend_name == "HPU_ATTN_V1" - self._remote_agents[engine_id][ - remote_tp_rank] = self.nixl_wrapper.add_remote_agent( - nixl_agent_meta.agent_metadata) + remote_agent_name = self.nixl_wrapper.add_remote_agent( + nixl_agent_meta.agent_metadata) # Number of D TP workers reading from a single P TP worker. This is # 1 when P and D `--tensor-parallel-size` match. - assert self._tp_size[self.engine_id] % self._tp_size[engine_id] == 0, ( - "Local TP size must be divisible by remote TP size.") - tp_ratio = self._tp_size[self.engine_id] // self._tp_size[engine_id] + tp_ratio = divide(self._tp_size[self.engine_id], + self._tp_size[engine_id]) assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP" - if self.use_mla: + assert not self._use_pallas_v1 or tp_ratio == 1, \ + "TPU (pallas_v1) DOES NOT support heterogeneous TP yet." + + # Handle tp_size>num_kv_heads: replicate KV cache. + total_num_kv_heads = self.model_config.get_total_num_kv_heads() + is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1 + + if self.use_mla or is_kv_replicated: # With MLA the only difference is in the number of blocks. remote_block_size = nixl_agent_meta.block_len // ( self.slot_size_bytes) @@ -695,15 +1020,14 @@ def add_remote_agent(self, # Account for joint KV in FlashInfer. remote_block_size //= 2 - assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( - "Remote P worker KV layer cache must be of shape [2, N, " - "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype." - ) - - assert self.block_size == remote_block_size, "Remote P worker with " \ - "different block size is not supported" + #assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( + # "Remote P worker KV layer cache must be of shape [2, N, " + # "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype." + #) - assert self.num_blocks >= nixl_agent_meta.num_blocks + #assert self.block_size == remote_block_size, ( + # "Remote P worker with different block size is not supported " + # f"{self.block_size=} {remote_block_size=}") # Create dst descs and xfer side handles. TP workers have same #blocks. if engine_id in self.dst_num_blocks: @@ -716,46 +1040,77 @@ def add_remote_agent(self, # rank. With heterogeneous TP, prepare the descriptors by splitting the # P KV cache along kv_head dim, of D worker's kv_head size (D>P). # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..]. - p_remote_tp_rank = self.tp_rank // tp_ratio # Only register the remote's descriptors if current rank pulls from it. 
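The heterogeneous-TP addressing described in the docstring above can be checked with plain integer arithmetic. A sketch under assumed sizes (the block length is invented, not taken from the patch):

# Decode side has 4 TP workers, prefill side has 2: each P worker serves 2 D workers.
local_tp_size, remote_tp_size = 4, 2
tp_ratio = local_tp_size // remote_tp_size             # 2
assert local_tp_size % remote_tp_size == 0 and tp_ratio > 0

remote_block_len = 64 * 1024                           # bytes per remote block (assumed)
for tp_rank in range(local_tp_size):
    p_remote_tp_rank = tp_rank // tp_ratio             # which P worker this D rank reads from
    # Byte offset into the remote block selecting this rank's kv-head slice.
    rank_offset = (tp_rank % tp_ratio) * remote_block_len // tp_ratio
    local_block_len = remote_block_len // tp_ratio     # each D rank reads 1/tp_ratio of the heads
    print(tp_rank, p_remote_tp_rank, rank_offset, local_block_len)
# tp_rank 0,1 -> P worker 0 (offsets 0 and 32768); tp_rank 2,3 -> P worker 1.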
- if p_remote_tp_rank == remote_tp_rank: - self.kv_caches_base_addr[ - engine_id] = nixl_agent_meta.kv_caches_base_addr - rank_offset = self.tp_rank % tp_ratio * self.block_len \ - if not self.use_mla else 0 - # Register all remote blocks, but only the corresponding kv heads. - for base_addr in nixl_agent_meta.kv_caches_base_addr: - for block_id in range(nixl_agent_meta.num_blocks): - block_offset = block_id * nixl_agent_meta.block_len - # For each block, grab the heads chunk belonging to rank_i - # of size remote_nheads // tp_ratio, which correspond to - # self.block_len == remote_block_len//tp_ratio bytes. - addr = base_addr + block_offset + rank_offset - # (addr, len, device id) - blocks_data.append((addr, self.block_len, remote_tp_rank)) + self.kv_caches_base_addr[ + engine_id] = nixl_agent_meta.kv_caches_base_addr + rank_offset = self.tp_rank % tp_ratio * nixl_agent_meta.block_len // tp_ratio \ + if not (self.use_mla or is_kv_replicated) else 0 + # Register all remote blocks, but only the corresponding kv heads. + for base_addr in nixl_agent_meta.kv_caches_base_addr: + for block_id in range(nixl_agent_meta.num_blocks): + block_offset = block_id * nixl_agent_meta.block_len + # For each block, grab the heads chunk belonging to rank_i + # of size remote_nheads // tp_ratio, which correspond to + # self.block_len == remote_block_len//tp_ratio bytes. + addr = base_addr + block_offset + rank_offset + # (addr, len, device id) + blocks_data.append((addr, nixl_agent_meta.block_len//tp_ratio, remote_tp_rank)) + logger.debug( + "Created %s blocks for dst engine %s with remote rank %s and " + "local rank %s", len(blocks_data), engine_id, remote_tp_rank, + self.tp_rank) + logger.debug(f'buke {self.slot_size_bytes=}|{tp_ratio=}|{self.block_len=}|{nixl_agent_meta.block_len=}|{self.tp_rank=}|{self._use_flashinfer=}') + # Register with NIXL. + descs = self.nixl_wrapper.get_xfer_descs(blocks_data, + self.remote_nixl_memory_type) + #print('buke register remote:', len(blocks_data), blocks_data[:10],blocks_data[-1],self.nixl_memory_type) + self.dst_xfer_side_handles[ + engine_id] = self.nixl_wrapper.prep_xfer_dlist( + remote_agent_name, descs) + + return remote_agent_name + + def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): + """copy recved kv from host buffer to device.""" + assert self.use_host_buffer + assert self.copy_blocks is not None + + local_block_ids = meta.local_block_ids + self.copy_blocks(self.block_size, self.host_xfer_buffers, self.device_kv_caches, + local_block_ids, local_block_ids, "h2d") + if logger.isEnabledFor(logging.DEBUG): logger.debug( - "Created %s blocks for dst engine %s with remote rank %s and " - "local rank %s", len(blocks_data), engine_id, remote_tp_rank, - self.tp_rank) - - # Register with NIXL. - descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") - self.dst_xfer_side_handles[ - engine_id] = self.nixl_wrapper.prep_xfer_dlist( - self._remote_agents[engine_id][remote_tp_rank], descs) + "synced recved kv of request[%s] to device kv buffer," + "local_block_ids: %s. 
", req_id, + ",".join(map(str, meta.local_block_ids))) + + def save_kv_to_host(self, metadata: NixlConnectorMetadata): + """copy kv from device to host buffer.""" + assert self.use_host_buffer + assert self.copy_blocks is not None + + for req_id, meta in metadata.reqs_to_save.items(): + if req_id not in self.req_send_time.keys(): + self.req_send_time[req_id] = time.perf_counter() + # logger.info(f"libin debug save_kv_to_host starts{os.getenv('RANK')} {req_id=} ") + + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "save_load_kv for request[%s] to host xfer buffer." + "local_block_ids: %s. ", req_id, + ",".join(map(str, meta.local_block_ids))) + # blocking + self.copy_blocks(self.block_size, self.device_kv_caches, self.host_xfer_buffers, + meta.local_block_ids, meta.local_block_ids, "d2h") + # logger.info(f"libin debug save_kv_to_host {os.getenv('RANK')} time:{time.perf_counter()-self.req_send_time[req_id]}") def get_finished(self) -> tuple[set[str], set[str]]: """ - Get requests that are done sending or recving. - - In TP>1 setup, each rank exchanges KVs with its counterpart - ranks independently. get_finished() runs in a worker creates - the done_sending and done_recving sets that are sent to the - scheduler via ModelRunnerOutput by Rank 0. To ensure trnxs - are done before adding to finished, Ranks 1 to N-1 communicate - to Rank 0 once their transaction is done + Rank 0 returns - finished sets to Scheduler only once all ranks are done. + Get requests that are done sending or recving on this specific worker. + The scheduler process (via the MultiprocExecutor) will use this output + to track which workers are done. """ + s1 = time.perf_counter() done_sending = self._get_new_notifs() done_recving = self._pop_done_transfers(self._recving_transfers) if len(done_sending) > 0 or len(done_recving) > 0: @@ -763,51 +1118,50 @@ def get_finished(self) -> tuple[set[str], set[str]]: "Rank %s, get_finished: %s requests done sending " "and %s requests done recving", self.tp_rank, len(done_sending), len(done_recving)) - - if self.world_size == 1: - return done_sending, done_recving - - # Rank 0: get finished from all other ranks. - if self.tp_rank == 0: - for req_id in done_sending: - self._done_sending_count[req_id] += 1 + if self.is_hetero and self.kv_buffer_device == "hpu": + #import remote_pdb; remote_pdb.set_trace() + t1 = time.perf_counter() + remote_block_size = self.block_size // self.block_factor + block_size, n_kv_heads, head_dim = self.block_shape for req_id in done_recving: - self._done_recving_count[req_id] += 1 - - # Keep track of how many other ranks have finished. - other_ranks_finished_ids: list[str] = [] - for i in range(1, self.world_size): - other_ranks_finished_ids.extend( - self.tp_group.recv_object(src=i)) - for req_id in other_ranks_finished_ids: - if (req_id in self._done_recving_count - or req_id in self._recving_transfers): - self._done_recving_count[req_id] += 1 - else: - self._done_sending_count[req_id] += 1 - - # Return ids that finished on all ranks to the scheduler. 
- all_done_recving: set[str] = set() - for req_id in list(self._done_recving_count.keys()): - if self._done_recving_count[req_id] == self.world_size: - del self._done_recving_count[req_id] - all_done_recving.add(req_id) - - all_done_sending: set[str] = set() - for req_id in list(self._done_sending_count.keys()): - if self._done_sending_count[req_id] == self.world_size: - del self._done_sending_count[req_id] - all_done_sending.add(req_id) - - return all_done_sending, all_done_recving - - # Ranks 1 to N-1: send finished ids to Rank 0. - else: - finished_req_ids = list(done_recving.union(done_sending)) - self.tp_group.send_object(finished_req_ids, dst=0) - - # Unused as only Rank 0 results are sent to scheduler. - return done_sending, done_recving + #print(req_id, self._recving_metadata) + meta = self._recving_metadata.pop(req_id) + for k, v in self.device_kv_caches.values(): + local_block_ids = meta.local_block_ids + #print(f'buke {local_block_ids=}|{k.shape=}') + for block_idx in local_block_ids: + #import remote_pdb; remote_pdb.set_trace() + k[block_idx*self.block_size: (1+block_idx)*self.block_size] = k[block_idx*self.block_size: (1+block_idx)*self.block_size].reshape(self.block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(self.block_size,n_kv_heads,head_dim) + v[block_idx*self.block_size: (1+block_idx)*self.block_size] = v[block_idx*self.block_size: (1+block_idx)*self.block_size].reshape(self.block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(self.block_size,n_kv_heads,head_dim) + #import remote_pdb; remote_pdb.set_trace() + t2 = time.perf_counter() + tt = t2-t1 + logger.debug(f'buke permute time:{tt}, {req_id=}|{self._recving_metadata=}') + + if self.use_host_buffer: + for req_id in done_recving: + s2 = time.perf_counter() + meta = self._recving_metadata.pop(req_id) + assert meta, f"{req_id} not found in recving_metadata list" + self.sync_recved_kv_to_device(req_id, meta) + # logger.info(f"libin debug get_finished {os.getenv('RANK')}, d2h {time.perf_counter()-s2}| {req_id=}") + + # Handle timeout to avoid stranding blocks on remote. + now = time.perf_counter() + while self._reqs_to_send: + req_id, expires = next(iter(self._reqs_to_send.items())) + # Sorted dict, oldest requests are put first so we can exit early. + if now < expires: + break + count = self.consumer_notification_counts_by_req.pop(req_id, 0) + logger.warning( + "Releasing expired KV blocks for request %s which were " + "retrieved by %d decode worker(s) within %d seconds.", req_id, + count, envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT) + del self._reqs_to_send[req_id] + done_sending.add(req_id) + + return done_sending, done_recving def _get_new_notifs(self) -> set[str]: """ @@ -819,12 +1173,20 @@ def _get_new_notifs(self) -> set[str]: for notifs in self.nixl_wrapper.get_new_notifs().values(): for notif in notifs: req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1) + if req_id not in self._reqs_to_send: + logger.error( + "Potentially invalid KV blocks for " + "unrecognized request %s were retrieved by " + "a decode worker. They may have expired.", req_id) + continue + self.consumer_notification_counts_by_req[req_id] += 1 # Wait all consumers (D) to be done reading before freeing. 
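The in-place reshuffle above, which restores the decode-side block layout after pulling prefill-layout sub-blocks on HPU, is easier to follow on a toy tensor. A sketch with invented dimensions:

import torch

block_factor, n_kv_heads, head_dim = 2, 4, 8
remote_block_size = 16                       # prefill-side tokens per block
block_size = block_factor * remote_block_size

# One received decode-side block, currently laid out as `block_factor`
# contiguous prefill sub-blocks of shape [n_kv_heads, remote_block_size, head_dim].
recv = torch.arange(block_size * n_kv_heads * head_dim, dtype=torch.float32)
as_sub_blocks = recv.reshape(block_factor, n_kv_heads, remote_block_size, head_dim)

# Restore the decode layout [block_size, n_kv_heads, head_dim]: tokens first, heads second.
restored = (as_sub_blocks.permute(0, 2, 1, 3)       # -> [factor, tokens, heads, dim]
            .contiguous()
            .reshape(block_size, n_kv_heads, head_dim))
print(restored.shape)                               # torch.Size([32, 4, 8])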
if self.consumer_notification_counts_by_req[req_id] == int( tp_ratio): notified_req_ids.add(req_id) del self.consumer_notification_counts_by_req[req_id] + del self._reqs_to_send[req_id] return notified_req_ids def _pop_done_transfers( @@ -838,52 +1200,104 @@ def _pop_done_transfers( """ done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): - for handle, xfer_stime in handles: + in_progress = False + for handle, _xfer_stime in handles: xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": + xfer_end_time = time.perf_counter() + # logger.info(f"libin debug _pop_done_transfers: {req_id=}|{handle=}|{xfer_end_time=}|{xfer_end_time-_xfer_stime=}") self.nixl_wrapper.release_xfer_handle(handle) - done_req_ids.add(req_id) - del transfers[req_id] elif xfer_state == "PROC": + in_progress = True continue else: raise RuntimeError("Transfer failed with state %s", xfer_state) + if not in_progress: + done_req_ids.add(req_id) + del transfers[req_id] return done_req_ids + def rewrite_kv_based_on_transfer_layout(self, metadata: NixlConnectorMetadata): + if self.decoder_tp_ratio == 1: + return + t = time.perf_counter() + for req_id, meta in metadata.reqs_to_save.items(): + block_ids = meta.local_block_ids + for k, v in self.device_kv_caches.items(): + gb, h, d = v[0].shape + indices = torch.tensor(block_ids, device=v[0].device) + gbhd = [int(gb/self.block_size), self.block_size, h, d] + for i in range(len(self.device_kv_caches[k])): + kv = v[i].reshape(gbhd) + kv_selected = torch.index_select(kv, 0, indices) + bc, bs, h, d = kv_selected.shape + shape = int(bs*h/self.decoder_tp_ratio*d) + blocks = torch.chunk(kv_selected, 2, dim=2) + vecs = [b.reshape([bc, shape]) for b in blocks] + kv_selected = torch.concat(vecs, dim=1).reshape(kv_selected.shape) + kv.index_copy_(dim=0, index=indices, source=kv_selected) + if len(metadata.reqs_to_save) > 0: + torch.hpu.synchronize() + logger.debug(f"rewrite_kv_based_on_transfer_layout done time:{time.perf_counter() - t}") + def start_load_kv(self, metadata: NixlConnectorMetadata): """ Start loading by triggering non-blocking nixl_xfer. We check for these trnxs to complete in each step(). """ - for req_id, meta in metadata.requests.items(): + s1 = time.perf_counter() + #logger.info(f'libin debug start_load_kv, {os.getenv('RANK')}') + for req_id, meta in metadata.reqs_to_recv.items(): + if req_id not in self.req_recv_time.keys(): + self.req_recv_time[req_id] = time.perf_counter() + # logger.info(f"libin debug start_load_kv starts {os.getenv('RANK')} for {req_id=}") + remote_engine_id = meta.remote_engine_id logger.debug( "start_load_kv for request %s from remote engine %s. " "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id, - meta.remote_engine_id, len(meta.local_block_ids), + remote_engine_id, len(meta.local_block_ids), len(meta.remote_block_ids)) - self._read_blocks( - request_id=req_id, - dst_engine_id=meta.remote_engine_id, - local_block_ids=meta.local_block_ids, - remote_block_ids=meta.remote_block_ids, - remote_host=meta.remote_host, - remote_port=meta.remote_port, - ) - - def _read_blocks( - self, - local_block_ids: list[int], - remote_block_ids: list[int], - remote_host: str, - remote_port: int, - dst_engine_id: str, - request_id: str, - ): - # NOTE(rob): this takes ~2s. We need to get this off the hotpath. 
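The revised completion check above only retires a request once none of its transfer handles are still in flight. A compact sketch of that bookkeeping, with made-up handle names and a stubbed state query in place of the NIXL call:

handle_states = {"h0": "DONE", "h1": "PROC", "h2": "DONE"}

def check_xfer_state(handle: str) -> str:      # stand-in for nixl_wrapper.check_xfer_state
    return handle_states[handle]

transfers = {"req-a": [("h0", 0.0), ("h1", 0.0)], "req-b": [("h2", 0.0)]}
done_req_ids = set()
for req_id, handles in list(transfers.items()):
    in_progress = False
    for handle, _start in handles:
        state = check_xfer_state(handle)
        if state == "PROC":
            in_progress = True
        elif state != "DONE":
            raise RuntimeError(f"Transfer failed with state {state}")
    if not in_progress:
        done_req_ids.add(req_id)
        del transfers[req_id]

print(done_req_ids)    # {'req-b'}; req-a still has h1 in flight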
- if dst_engine_id not in self._remote_agents: - self._nixl_handshake(remote_host, remote_port) + if self.is_hetero or self.use_host_buffer: + self._recving_metadata[req_id] = meta + if remote_engine_id not in self._remote_agents: + # Initiate handshake with remote engine to exchange metadata. + with self._handshake_lock: + if remote_engine_id not in self._remote_agents: + s2 = time.perf_counter() + # logger.info(f"libin debug start_load_kv {os.getenv('RANK')}, start handleshake before {s2 - s1} {req_id=}") + self._background_nixl_handshake( + req_id, remote_engine_id, meta) + continue + s3 = time.perf_counter() + #logger.info(f'libin debug _read_blocks_for_req start {os.getenv('RANK')}, {req_id=} handshake time {s3-self.req_recv_time[req_id]}') + # Handshake already completed, start async read xfer. + self._read_blocks_for_req(req_id, meta) + #logger.info(f'libin debug _read_blocks_for_req end {os.getenv('RANK')}, {req_id=} async transfer {time.perf_counter() - s3}') + # Start transfers for requests whose handshakes have now finished. + while not self._ready_requests.empty(): + s4 = time.perf_counter() + #logger.info(f'libin debug _read_blocks_for_req1 start {os.getenv('RANK')}') + self._read_blocks_for_req(*self._ready_requests.get_nowait()) + #logger.info(f'libin debug _read_blocks_for_req1 end {os.getenv('RANK')} async transfer: {time.perf_counter() - s4}') + # Add to requests that are waiting to be read and track expiration. + self._reqs_to_send.update(metadata.reqs_to_send) + + def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): + logger.debug( + "Remote agent %s available, calling _read_blocks for req %s", + meta.remote_engine_id, req_id) + self._read_blocks( + request_id=req_id, + dst_engine_id=meta.remote_engine_id, + local_block_ids=meta.local_block_ids, + remote_block_ids=meta.remote_block_ids, + is_mem_hit=meta.is_mem_hit, + ) + def _read_blocks(self, local_block_ids: list[int], + remote_block_ids: list[int], dst_engine_id: str, + request_id: str, is_mem_hit: bool = False): # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). # after we detect the txn is complete (which means we cannot make the @@ -912,8 +1326,8 @@ def _read_blocks( # Partial prefix cache hit: just read uncomputed blocks. num_remote_blocks = len(remote_block_ids) assert num_local_blocks <= num_remote_blocks - if num_local_blocks < num_remote_blocks: - remote_block_ids = remote_block_ids[-num_local_blocks:] + #if num_local_blocks < num_remote_blocks: + # remote_block_ids = remote_block_ids[-num_local_blocks:] # Get side handles. local_xfer_side_handle = self.src_xfer_side_handle @@ -926,6 +1340,22 @@ def _read_blocks( # Get descs ids. 
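On a partial prefix-cache hit the reader only needs the uncomputed tail of the remote blocks, which is what the (now bypassed) slicing above did. A short example of that alignment with made-up block ids:

# Remote (prefill) wrote 6 blocks; locally the first 2 are already cached,
# so only 4 local blocks were allocated and only the last 4 remote blocks
# need to be read.
remote_block_ids = [10, 11, 12, 13, 14, 15]
local_block_ids = [3, 4, 5, 6]

num_local = len(local_block_ids)
assert num_local <= len(remote_block_ids)
if num_local < len(remote_block_ids):
    remote_block_ids = remote_block_ids[-num_local:]

print(list(zip(local_block_ids, remote_block_ids)))
# [(3, 12), (4, 13), (5, 14), (6, 15)]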
local_block_descs_ids: list[int] = [] remote_block_descs_ids: list[int] = [] + + if self.block_factor > 1: + local_sub_block_ids = [b for x in local_block_ids for b in range(x * self.block_factor, (x + 1) * self.block_factor)] + assert len(local_sub_block_ids) <= len(remote_block_ids) + valid_len = len(local_sub_block_ids) + logger.debug(f'buke {local_block_ids=} |{remote_block_ids=} |{valid_len=} |{len(remote_block_ids)}') + if is_mem_hit: + remote_block_ids = remote_block_ids[-valid_len:] + else: + remote_block_ids = remote_block_ids[:valid_len] + local_block_ids = local_sub_block_ids[:valid_len] + logger.debug(f'buke {local_block_ids=} |{remote_block_ids=} |{local_sub_block_ids=} | {is_mem_hit=}') + else: + if num_local_blocks < num_remote_blocks: + remote_block_ids = remote_block_ids[-num_local_blocks:] + if not self.block_window_per_layer: # Default case: assume global attention remote_block_descs_ids = self._get_block_descs_ids( @@ -1017,7 +1447,6 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: if socket_type not in (zmq.ROUTER, zmq.REQ): raise ValueError(f"Unexpected socket type: {socket_type}") - ctx: Optional[zmq.Context] = None try: ctx = zmq.Context() # type: ignore[attr-defined] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 59971f5d65af..29af3ddcb391 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -267,6 +267,7 @@ class _AsyncLLMEngine(LLMEngine): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + async def step_async( self, virtual_engine: int ) -> List[Union[RequestOutput, PoolingRequestOutput]]: @@ -281,6 +282,7 @@ async def step_async( """ # these are cached outputs from previous iterations. None if on first # iteration + s1 = time.perf_counter() cached_outputs = self.cached_scheduler_outputs[virtual_engine] seq_group_metadata_list = cached_outputs.seq_group_metadata_list scheduler_outputs = cached_outputs.scheduler_outputs @@ -352,11 +354,11 @@ async def step_async( if allow_async_output_proc: execute_model_req.async_callback = self.async_callbacks[ virtual_engine] - + s2 = time.perf_counter() # Execute the model. outputs = await self.model_executor.execute_model_async( execute_model_req) - + s3 = time.perf_counter() # we need to do this here so that last step's sampled_token_ids can # be passed to the next iteration for PP. if self.scheduler_config.is_multi_step: @@ -416,7 +418,7 @@ async def step_async( if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) assert len(ctx.output_queue) == 0 - + s4 = time.perf_counter() return ctx.request_outputs async def stop_remote_worker_execution_loop_async(self) -> None: @@ -481,6 +483,7 @@ async def add_request_async( ) -> None: """Async version of [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].""" + s1 = time.perf_counter() if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -533,6 +536,7 @@ async def add_request_async( priority=priority, ) + async def check_health_async(self) -> None: self.model_executor.check_health() @@ -834,7 +838,8 @@ async def run_engine_loop(engine_ref: ReferenceType): has_requests_in_progress = [False] * pipeline_parallel_size while True: if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") + s1 = time.perf_counter() + logger.info("Waiting for new requests...") # Stop the execute model loop in parallel workers until there # are more requests to process. 
This avoids waiting # indefinitely in torch.distributed ops which may otherwise @@ -849,15 +854,19 @@ async def run_engine_loop(engine_ref: ReferenceType): await asyncio.sleep(0) if engine_ref() is None: return + await request_tracker.wait_for_new_requests() engine = engine_ref() if not engine: return - logger.debug("Got new requests!") + s2 = time.perf_counter() + logger.info("Got new requests!") requests_in_progress = [ asyncio.create_task(engine.engine_step(ve)) for ve in range(pipeline_parallel_size) ] + s3 = time.perf_counter() + logger.info("Processing engine step for new requests!") has_requests_in_progress = [True] * pipeline_parallel_size # Abort if iteration takes too long due to unrecoverable errors @@ -870,6 +879,8 @@ async def run_engine_loop(engine_ref: ReferenceType): for _ in range(pipeline_parallel_size): await asyncio.sleep(0) for task in done: + s4 = time.perf_counter() + logger.info("Done engine step for new requests 1st step! {os.getnev('RANK)}| total:{t4-t1}| req:{s2-s1}| exe:{ s3-s2}| out:{s4-s3}") result = task.result() virtual_engine = requests_in_progress.index(task) has_unfinished_requests = ( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2f8819bca60d..3259694bf5f2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -8,7 +8,7 @@ import inspect import json import multiprocessing -import os +import os,time import signal import socket import tempfile @@ -586,13 +586,15 @@ async def create_chat_completion(request: ChatCompletionRequest, @with_cancellation @load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): + s1 = time.perf_counter() handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - + #import remote_pdb;remote_pdb.set_trace() try: generator = await handler.create_completion(request, raw_request) + s2 = time.perf_counter() except OverflowError as e: raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)) from e @@ -601,12 +603,24 @@ async def create_completion(request: CompletionRequest, raw_request: Request): detail=str(e)) from e if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), + re = JSONResponse(content=generator.model_dump(), status_code=generator.code) + s3 = time.perf_counter() + # logger.info(f"libin debug create_completion 1 my_rank:{os.getenv('RANK')} takes: {s3-s1}") + return re + elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) + + re = JSONResponse(content=generator.model_dump()) + s3 = time.perf_counter() + # logger.info(f"libin debug create_completion 2 my_rank:{os.getenv('RANK')} takes: {s3-s1}| {s3-s2=}") + return re - return StreamingResponse(content=generator, media_type="text/event-stream") + re = StreamingResponse(content=generator, media_type="text/event-stream") + s3 = time.perf_counter() + + # logger.info(f"libin debug create_completion 3 my_rank:{os.getenv('RANK')} takes: {s3-s1}|| {s3-s2=}") + return re @router.post("/v1/embeddings", diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index ce5eca855028..ddbb42958b82 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio 
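Several spots in this patch take ad-hoc perf_counter timestamps (s1 through s4) around stages. A small reusable timer keeps that instrumentation tidier; this is only a suggested pattern, and stage_timer is a hypothetical helper, not something the patch defines:

import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def stage_timer(name: str):
    """Log how long a named stage took, at debug level."""
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.debug("%s took %.4fs", name, time.perf_counter() - start)

# Usage inside an engine step, with hypothetical stage bodies:
with stage_timer("process_input_queue"):
    pass   # e.g. self._process_input_queue()
with stage_timer("engine_step"):
    pass   # e.g. executed = self._process_engine_step()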
-import time +import time,os from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence from typing import Optional, Union, cast @@ -80,6 +80,8 @@ async def create_completion( - suffix (the language models we currently support do not support suffix) """ + # logger.info(f"libin enter create_completion my rank:{os.getenv('RANK')}") + s1 = time.perf_counter() error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -101,7 +103,8 @@ async def create_completion( request_id = f"cmpl-{self._base_request_id(raw_request)}" created_time = int(time.time()) - + s2 = time.perf_counter() + # logger.info(f"libin create_completion my rank:{os.getenv('RANK')} create req {request_id}") request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: raw_request.state.request_metadata = request_metadata @@ -133,7 +136,8 @@ async def create_completion( except jinja2.TemplateError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - + s3 = time.perf_counter() + # logger.info(f"libin create_completion my rank:{os.getenv('RANK')} schedule {request_id}") # Schedule the request and get the result generator. generators: list[AsyncGenerator[RequestOutput, None]] = [] try: @@ -204,9 +208,9 @@ async def create_completion( except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - + s4 = time.perf_counter() result_generator = merge_async_iterators(*generators) - + s5 = time.perf_counter() model_name = self._get_model_name(request.model, lora_request) num_prompts = len(engine_prompts) @@ -216,10 +220,11 @@ async def create_completion( stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) - + s6 = time.perf_counter() + # logger.info(f"libin create_completion my rank:{os.getenv('RANK')} creating result {request_id}") # Streaming response if stream: - return self.completion_stream_generator( + ret = self.completion_stream_generator( request, result_generator, request_id, @@ -228,6 +233,8 @@ async def create_completion( num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata) + # logger.info(f"libin create_completion done response stream my rank:{os.getenv('RANK')} time:{s6-s1}| b_schedure:{s3-s1}| schedule:{s4-s3}| merge:{s5-s4}| {s6-s5=}") + return ret # Non-streaming response final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts @@ -274,9 +281,11 @@ async def create_completion( async def fake_stream_generator() -> AsyncGenerator[str, None]: yield f"data: {response_json}\n\n" yield "data: [DONE]\n\n" - + s7 = time.perf_counter() + # logger.info(f"libin create_completion done response fake stream my rank:{os.getenv('RANK')} time:{s7-s1}| b_schedure:{s3-s1}| schedule:{s4-s3}| merge:{s5-s4}| {s6-s5=}| result:{s7-s6}") return fake_stream_generator() - + s7 = time.perf_counter() + # logger.info(f"libin create_completion done response non-stream my rank:{os.getenv('RANK')} time:{s7-s1}| b_schedure:{s3-s1}| schedule:{s4-s3}| merge:{s5-s4}| {s6-s5=}| result:{s7-s6}") return response async def completion_stream_generator( @@ -290,6 +299,8 @@ async def completion_stream_generator( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> AsyncGenerator[str, None]: + s1 = time.perf_counter() + # logger.info(f"libin enter completion_stream_generator my rank:{os.getenv('RANK')}") 
num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts previous_num_tokens = [0] * num_choices * num_prompts @@ -306,6 +317,7 @@ async def completion_stream_generator( try: async for prompt_idx, res in result_generator: + # logger.info(f"libin result_generator completion_stream_generator my rank:{os.getenv('RANK')} {time.perf_counter()- s1}") prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt @@ -396,6 +408,7 @@ async def completion_stream_generator( ) response_json = chunk.model_dump_json(exclude_unset=False) + # logger.info(f"libin debug process output not end my rank:{os.getenv('RANK')}") yield f"data: {response_json}\n\n" total_prompt_tokens = sum(num_prompt_tokens) @@ -415,6 +428,7 @@ async def completion_stream_generator( ) final_usage_data = (final_usage_chunk.model_dump_json( exclude_unset=False, exclude_none=True)) + # logger.info(f"libin debug process output 1 end my rank:{os.getenv('RANK')}") yield f"data: {final_usage_data}\n\n" # report to FastAPI middleware aggregate usage across all choices @@ -424,6 +438,7 @@ async def completion_stream_generator( # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" + # logger.info(f"libin debug process output end my rank:{os.getenv('RANK')}") yield "data: [DONE]\n\n" def request_output_to_completion_response( diff --git a/vllm/envs.py b/vllm/envs.py index a3e8f16640b2..9988531b50ab 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -121,10 +121,12 @@ VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 + VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120 VLLM_ALL2ALL_BACKEND: str = "naive" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 VLLM_SLEEP_WHEN_IDLE: bool = False + VLLM_HPU_FORCE_MARK_STEP: bool = True def get_default_cache_root(): @@ -739,6 +741,11 @@ def get_vllm_port() -> Optional[int]: "VLLM_HPU_USE_DELAYED_SAMPLING": lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in ("1", "true"), + + # Do mark_step for HPU to split hpu graph or prevent fused kernel + "VLLM_HPU_FORCE_MARK_STEP": + lambda: os.environ.get("VLLM_HPU_FORCE_MARK_STEP", "true").lower() in + ("1", "true"), # Converts model weights to FP8UZ format. "VLLM_HPU_CONVERT_TO_FP8UZ": @@ -832,6 +839,13 @@ def get_vllm_port() -> Optional[int]: "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: os.getenv("VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"), + # Time (in seconds) after which the KV cache on the producer side is + # automatically cleared if no READ notification is received from the + # consumer. This is only applicable when using NixlConnector in a + # disaggregated decode-prefill setup. + "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": + lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")), + # Port used for NIXL handshake between remote agents. "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")), diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index fdf7eeea4ddd..c933f845ccff 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -266,6 +266,7 @@ def forward( # between decode runs. 
self._init_sampling_tensors(logits, sampling_metadata) + self.skip_softmax_for_greedy = sampling_metadata.skip_softmax_for_greedy assert self._sampling_tensors is not None sampling_tensors = self._sampling_tensors do_penalties = self._do_penalties @@ -284,6 +285,10 @@ def forward( # Use float32 to apply temperature scaling. # Use in-place division to avoid creating a new tensor. + if not self.skip_softmax_for_greedy: + logits = logits.to(torch.float) + logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) + logits = logits.to(torch.float) logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) @@ -302,9 +307,11 @@ def forward( # We use float32 for probabilities and log probabilities. # Compute the probabilities. - probs = torch.softmax(logits, dim=-1, dtype=torch.float) + probs = logits if self.skip_softmax_for_greedy \ + else torch.softmax(logits, dim=-1, dtype=torch.float) # Compute the log probabilities. - logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) + logprobs = logits if self.skip_softmax_for_greedy \ + else torch.log_softmax(logits, dim=-1, dtype=torch.float) # Sample the next tokens. maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index be3f2cd53c9f..f692bdeeb27d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,7 +30,7 @@ import torch from torch import nn from transformers import LlamaConfig - +import vllm.envs as envs from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -374,6 +374,7 @@ def __init__(self, self.config = config self.quant_config = quant_config + self.do_mark_step = envs.VLLM_HPU_FORCE_MARK_STEP lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 self.vocab_size = config.vocab_size + lora_vocab @@ -431,7 +432,7 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - if is_hpu: + if is_hpu and self.do_mark_step: import habana_frameworks.torch as htorch htorch.core.mark_step() diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index c7f88b8d91d7..e901cb81ff66 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -139,6 +139,7 @@ def __init__( num_prompts: int, skip_sampler_cpu_output: bool = False, reuse_sampling_tensors: bool = False, + skip_softmax_for_greedy: bool = False, ) -> None: self.seq_groups = seq_groups self.selected_token_indices = selected_token_indices @@ -146,6 +147,7 @@ def __init__( self.num_prompts = num_prompts self.skip_sampler_cpu_output = skip_sampler_cpu_output self.reuse_sampling_tensors = reuse_sampling_tensors + self.skip_softmax_for_greedy = skip_softmax_for_greedy @staticmethod def prepare( @@ -162,6 +164,7 @@ def prepare( selected_token_indices, categorized_sample_indices, num_prompts, + skip_softmax_for_greedy, ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens, device, generators, cache) selected_token_indices = async_tensor_h2d( @@ -186,7 +189,9 @@ def prepare( selected_token_indices=selected_token_indices, categorized_sample_indices=categorized_sample_indices, num_prompts=num_prompts, + skip_softmax_for_greedy=skip_softmax_for_greedy, ) + return sampling_metadata def __repr__(self) -> str: @@ -209,6 +214,7 @@ def 
_prepare_seq_groups( list[int], dict[SamplingType, list[int]], int, + bool, ]: """Prepare sequence groups and indices for sampling. @@ -249,6 +255,11 @@ def _prepare_seq_groups( logit_idx = 0 # Total number of prompts from given sequence groups. num_prompts = 0 + + # This is used to skip softmax for greedy sampling. + # initial value is True, once we hit one non-greedy sampling type + # or with logprobs, we will set it to False. + skip_softmax_for_greedy = True for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = seq_group_metadata.seq_data.keys() @@ -347,6 +358,17 @@ def sample(logits): categorized_sample_indices[sampling_params.sampling_type].extend( list(range(logit_idx, logit_idx + sample_len))) logit_idx += sample_len + if skip_softmax_for_greedy: + # If we detect non_greedy in seq_group_metadata, we will + # set skip_softmax_for_greedy to False. + skip_softmax_for_greedy = \ + sampling_params.sampling_type == SamplingType.GREEDY + skip_softmax_for_greedy = skip_softmax_for_greedy and \ + (sampling_params.logprobs is None or \ + sampling_params.logprobs == 0) + skip_softmax_for_greedy = skip_softmax_for_greedy and \ + (sampling_params.prompt_logprobs is None or \ + sampling_params.prompt_logprobs == 0) if cache is not None: sample_obj.sampling_params = sampling_params @@ -374,7 +396,7 @@ def sample(logits): cache.reset() return (seq_groups, selected_token_indices, categorized_sample_indices, - num_prompts) + num_prompts, skip_softmax_for_greedy) @dataclass diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 6e29cddf0fc9..32b358042877 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -129,7 +129,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on HPU.") - return False + return True @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index fc701215ba5d..2d6ce01dd839 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -68,7 +68,7 @@ def __init__( self, kv_cache_config: KVCacheConfig, max_model_len: int, - enable_caching: bool = True, + enable_caching: bool = False, caching_hash_algo: str = "builtin", use_eagle: bool = False, log_stats: bool = False, @@ -141,6 +141,12 @@ def get_computed_blocks(self, - A list of blocks that are computed for the request. - The number of computed tokens. """ + # Request already has blocks from async load via KVConnector. + # num_existing_blocks = len( + # self.req_to_blocks[request.request_id]) + # if num_existing_blocks > 0: + # return KVCacheBlocks.create_empty(), request.num_computed_tokens + # Prefix caching is disabled or # When the request requires prompt logprobs, we skip prefix caching. 
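The optimization being wired up here relies on softmax being monotonic: the argmax of the raw logits equals the argmax of the probabilities, so greedy sampling with no logprobs requested never needs the (log-)softmax. A quick numerical check with random logits, not code from the patch:

import torch

torch.manual_seed(0)
logits = torch.randn(4, 32000)             # [batch, vocab], arbitrary values

greedy_from_logits = logits.argmax(dim=-1)
greedy_from_probs = torch.softmax(logits, dim=-1, dtype=torch.float).argmax(dim=-1)
greedy_from_logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float).argmax(dim=-1)

assert torch.equal(greedy_from_logits, greedy_from_probs)
assert torch.equal(greedy_from_logits, greedy_from_logprobs)
# Dividing by a positive temperature does not change the argmax either,
# which is why the greedy-only path can also skip temperature scaling.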
if (not self.enable_caching @@ -381,9 +387,10 @@ def get_block_ids(self, request_id: str) -> list[list[int]]: def cache_blocks(self, request: Request, block_hashes: list[BlockHash], num_computed_tokens: int) -> None: - """Cache the blocks for the request.""" - self.coordinator.cache_blocks(request, block_hashes, - num_computed_tokens) + """Cache the blocks for the request, if enabled.""" + if self.enable_caching: + self.coordinator.cache_blocks(request, block_hashes, + num_computed_tokens) def create_empty_block_list(self) -> KVCacheBlocks: """Creates a new KVCacheBlocks instance with no blocks.""" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 089f15aee5b0..89b22f1ca7bb 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -40,6 +40,8 @@ from vllm.v1.metrics.prometheus import shutdown_prometheus from vllm.v1.metrics.stats import IterationStats, SchedulerStats +import time, os + logger = init_logger(__name__) @@ -302,11 +304,12 @@ async def generate( """ try: + s1 = time.perf_counter() # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() - + s2 = time.perf_counter() q = await self.add_request( request_id, prompt, @@ -317,18 +320,20 @@ async def generate( priority=priority, data_parallel_rank=data_parallel_rank, ) - + s3 = time.perf_counter() # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. finished = False while not finished: + s4 = time.perf_counter() # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() - + s5 = time.perf_counter() # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. finished = out.finished + # logger.info(f"libin debug async_llm generate loop my rank:{os.getenv('RANK')}| time:{s5-s4}") yield out # If the request is disconnected by the client, generate() @@ -357,6 +362,7 @@ async def generate( if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e + # logger.info(f"libin debug async_llm generate my rank:{os.getenv('RANK')}| takes:{time.perf_counter()-s1}| request:{s3-s1}") def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f36a491a1970..bb5e8be8f38b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -852,10 +852,12 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM while True: # 1) Poll the input queue until there is work to do. + s1 = time.perf_counter() self._process_input_queue() - + s2 = time.perf_counter() # 2) Step the engine core. executed = self._process_engine_step() + s3 = time.perf_counter() self._maybe_publish_request_counts() local_unfinished_reqs = self.scheduler.has_unfinished_requests() @@ -867,7 +869,7 @@ def run_busy_loop(self): # We are in a running state and so must execute a dummy pass # if the model didn't execute any ready requests. self.execute_dummy_batch() - + s4 = time.perf_counter() # 3) All-reduce operation to determine global unfinished reqs. 
self.engines_running = self._has_global_unfinished_reqs( local_unfinished_reqs) @@ -881,7 +883,9 @@ def run_busy_loop(self): (-1, EngineCoreOutputs(wave_complete=self.current_wave))) self.current_wave += 1 - + s5 = time.perf_counter() + # logger.info(f"libin debug core run_busy {os.getenv('RANK')| total:{s5-s1}| input:{s2-s1}| step:{s3-s2} | after_s:{s4-s3} | not_finished{s5-s4}}") + def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: # Optimization - only perform finish-sync all-reduce every 24 steps. diff --git a/vllm/v1/worker/hpu_model_runner.py b/vllm/v1/worker/hpu_model_runner.py index 1bd974659349..cbca4accfe52 100644 --- a/vllm/v1/worker/hpu_model_runner.py +++ b/vllm/v1/worker/hpu_model_runner.py @@ -8,8 +8,8 @@ import os import time from dataclasses import dataclass, field, fields -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeAlias, Union - +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeAlias, Union, Literal +import vllm.envs as envs import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import numpy as np @@ -52,6 +52,10 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.core.sched.output import NewRequestData + +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 + if TYPE_CHECKING: from vllm.v1.core.scheduler import SchedulerOutput @@ -60,7 +64,9 @@ _TYPE_CACHE: dict[str, dict[str, Any]] = {} - +hpu_buffer = None +is_hetero = os.getenv('PT_HPU_ENABLE_RESTORE_KV_LAYOUT', '0') == '1' +block_factor = int(os.getenv('PT_HPU_BLOCK_SIZE_FACTOR', '1')) @dataclass class PromptDecodeInfo: prompt_req_ids: list[str] @@ -264,7 +270,7 @@ def __init__(self, model, vllm_config): def _get_rotary_embedding_module(self, model: torch.nn.Module): """ Dynamically get the RotaryEmbedding layer in the model. - This function will recursively search through the module + This function will recursively search through the module hierarchy to find and return a RotaryEmbedding layer. If no such layer is found, it returns None. """ @@ -388,6 +394,7 @@ def forward(self, *args, **kwargs): is_warmup = kwargs.get('warmup_mode', False) if 'warmup_mode' in kwargs: kwargs.pop('warmup_mode') + is_warmup = kwargs.get('is_warmup', False) input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), @@ -399,11 +406,7 @@ def forward(self, *args, **kwargs): if 'kv_caches' in kwargs: kwargs.pop('kv_caches') with set_forward_context(attn_meta, self.vllm_config): - if not is_warmup: - self.maybe_start_load_kv() hidden_states = self.model(*args, **kwargs) - if not is_warmup: - self.maybe_wait_for_kv_save() if self._rotary_prepare_cos_sin is not None: self._reset_rotary_cos_sin() @@ -555,6 +558,7 @@ def __init__( # on env vars... 
this should be fixed in the future self.enable_bucketing = get_config().use_bucketing self.use_contiguous_pa = get_config().use_contiguous_pa + self.do_mark_step = envs.VLLM_HPU_FORCE_MARK_STEP self.skip_warmup = get_config().skip_warmup model_config = self.model_config @@ -584,7 +588,7 @@ def __init__( self.parallel_config) self.head_size = self.model_config.get_head_size() self.hidden_size = self.model_config.get_hidden_size() - + logger.debug(f'buke model config: {self.model_config=}') self.attn_backend = get_attn_backend( self.head_size, self.dtype, @@ -650,6 +654,10 @@ def __init__( self.profiler_counter_helper = HabanaProfilerCounterHelper() self.defragmenter = OnlineDefragmenter() + from datetime import datetime + current_timestamp = datetime.now() + unix_timestamp = current_timestamp.timestamp() + self.modelrunnerid = unix_timestamp def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ @@ -850,6 +858,10 @@ def is_decoder_only(self, req_id) -> bool: return bool(req_id in self.input_batch.req_type and \ self.input_batch.req_type[req_id] == "decode") + def is_prefill_only(self, req_id) -> bool: + return bool(req_id in self.input_batch.req_type and \ + self.input_batch.req_type[req_id] == "prefill") + def _get_prompts_and_decodes( self, scheduler_output: "SchedulerOutput", @@ -858,49 +870,60 @@ def _get_prompts_and_decodes( assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs assert num_reqs > 0 + #TODO: remove later + requests_type = {} if scheduler_output.kv_connector_metadata: - requests = scheduler_output.kv_connector_metadata.requests + for req in scheduler_output.kv_connector_metadata.reqs_to_save: + requests_type[req] = 'prefill' + for req in scheduler_output.kv_connector_metadata.reqs_to_recv: + requests_type[req] = 'decode' + requests = scheduler_output.kv_connector_metadata.reqs_to_save | scheduler_output.kv_connector_metadata.reqs_to_recv else: requests = None # Traverse decodes first decode_req_ids = [] num_computed_tokens_decode = [] + for i in range(num_reqs): req_id = self.input_batch.req_ids[i] assert req_id is not None - + # P case assigment if requests is not None and req_id not in self.input_batch.req_type: for request in requests: - if request.req_id == req_id: - self.input_batch.req_type[req_id] = "prefill" \ - if request.load_spec is None else "decode" + if request == req_id: + self.input_batch.req_type[req_id] = requests_type[req_id] break num_computed_tokens = self.input_batch.num_computed_tokens_cpu[i] num_prompt_tokens = self.input_batch.num_prompt_tokens[i] num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] - + #logger.info(f'libin debug _get_prompts_and_decodes 1st_loop {os.getenv('RANK')} {num_reqs=} \ + # {num_computed_tokens=}|{num_prompt_tokens=}|{num_scheduled_tokens=}') if num_computed_tokens < num_prompt_tokens and \ not self.is_decoder_only(req_id): # This is prompt break # This is decode - if not self.is_decoder_only(req_id): - assert num_scheduled_tokens == 1 + #if not self.is_decoder_only(req_id): + #assert num_scheduled_tokens == 1 + decode_req_ids.append(req_id) num_computed_tokens_decode.append(int(num_computed_tokens + 1)) + if self.profiler.enabled: self.profiler_counter_helper.capture_decode_seq_stats( num_computed_tokens_decode) + # Traverse prompts prompt_req_ids = [] prompt_scheduled_tokens = [] + for i in range(len(decode_req_ids), num_reqs): req_id = self.input_batch.req_ids[i] assert req_id is not None @@ -913,12 +936,10 @@ def _get_prompts_and_decodes( # Must be prompt assert 
num_computed_tokens < num_prompt_tokens num_output_tokens = len(self.requests[req_id].output_token_ids) - assert num_output_tokens == 0, \ - f'req_id: {req_id}, {num_output_tokens}' - prompt_req_ids.append(req_id) prompt_scheduled_tokens.append(num_scheduled_tokens) - + #logger.info(f'libin debug _get_prompts_and_decodes after 2nd_loop {os.getenv('RANK')} {num_reqs=} {len(prompt_req_ids)=}\ + # |{len(decode_req_ids)=} | {num_computed_tokens=}|{num_prompt_tokens=}|{num_scheduled_tokens=}') return PromptDecodeInfo(prompt_req_ids, decode_req_ids, prompt_scheduled_tokens) @@ -1374,8 +1395,8 @@ def _prepare_inputs( num_scheduled_tokens.append(seq_num_scheduled_tokens) num_prompt_tokens.append(seq_num_prompt_tokens) # NOTE: assert that all the decodes are "decodes". - if idx < num_decodes and not self.is_decoder_only(req_id): - assert seq_num_scheduled_tokens == 1 + #if idx < num_decodes and not self.is_decoder_only(req_id): + #assert seq_num_scheduled_tokens == 1 return (self._prepare_prefill_inputs(num_prefills, num_decodes, num_scheduled_tokens), self._prepare_decode_inputs(num_decodes, num_scheduled_tokens)) @@ -1405,7 +1426,8 @@ def _execute_model_generic(self, attn_metadata, logits_indices, kv_caches, - warmup_mode=False): + warmup_mode=False, + scheduler_output = None): # FORWARD. batch_size = token_ids.size(0) @@ -1582,7 +1604,8 @@ def execute_model( # Transfer [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] to CPU # On CPU, sanitize [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] -> [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] # noqa # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] - + #logger.debug(f'buke enter execute_model ||{os.getpid()=}|{scheduler_output=}') + s1 = time.perf_counter() if self.defragmenter.enabled and self.kv_caches: new = { req.req_id: flatten(req.block_ids) @@ -1602,7 +1625,8 @@ def execute_model( if not has_kv_transfer_group(): # Return empty ModelRunnerOuptut if there's no work to do. 
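The prefill/decode classification added to _get_prompts_and_decodes above keys off which connector map a request appears in: requests whose KV is being saved are treated as prefills on this instance, requests whose KV is being received as decodes. A tiny sketch of that mapping with dummy request ids; the dict values are simplified stand-ins for the real per-request metadata:

# Hypothetical connector metadata: request id -> per-request transfer info.
reqs_to_save = {"req-p0": {"blocks": [0, 1]}}    # KV produced here -> prefill
reqs_to_recv = {"req-d0": {"blocks": [7, 8]}}    # KV pulled from remote -> decode

requests_type = {req_id: "prefill" for req_id in reqs_to_save}
requests_type.update({req_id: "decode" for req_id in reqs_to_recv})

# The runner then iterates over the union of both maps (dict union keeps all keys).
requests = reqs_to_save | reqs_to_recv
for req_id in requests:
    print(req_id, requests_type[req_id])
# req-p0 prefill
# req-d0 decode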
                 return EMPTY_MODEL_RUNNER_OUTPUT
-
+            #logger.info(f'buke before kv_connector_no_forward |{os.getpid()=}|{scheduler_output.total_num_scheduled_tokens=}|{scheduler_output=}')
+            # For D case, wait until kv finish load here
             return self.kv_connector_no_forward(scheduler_output)

         # If necessary, swap decodes/prompts to have all decodes on the start
@@ -1622,9 +1646,17 @@
         prefill_sampled_requests = []
         decode_sampled_token_ids = []
         decode_sampled_requests = []
+        #if not has_kv_transfer_group():
+            #assert not (num_prefills > 0 and num_decodes > 0)
+        with set_forward_context(None, self.vllm_config):
+            self.maybe_setup_kv_connector(scheduler_output)
+        finished_sending, finished_recving = set(), set()
+        token_ids_s = None

         ######################### PREFILLS #########################
         if num_prefills > 0:
+            htorch.core.mark_step()
+
             for idx, (req_id, prompt_len, token_ids, position_ids,
                       attn_metadata, logits_indices,
                       logits_requests) in enumerate(
@@ -1632,14 +1664,14 @@
                 self.event_start = self.profiler.get_timestamp_us()
                 self.profiler.start("internal", "prefill")
                 htorch.core.mark_step()
-                self.maybe_setup_kv_connector(scheduler_output)
+                token_ids_s = token_ids.shape
                 prefill_hidden_states_ts, logits_device = \
                     self._execute_model_generic(
                         token_ids, position_ids, attn_metadata, logits_indices,
                         self.kv_caches)
-                htorch.core.mark_step()
-                finished_sending, finished_recving = (
-                    self.get_finished_kv_transfers(scheduler_output))
+                if self.do_mark_step:
+                    htorch.core.mark_step()
+                #logger.info(f'libin debug done prompt {os.getenv('RANK')} {token_ids.shape=} ')
                 with self.profiler.record_event('internal', "sampler"):
                     sampling_metadata = self._prepare_sampling(
                         batch_changed, req_id, pad_to=logits_device.shape[0])
@@ -1663,25 +1695,25 @@
                         prompt_batch_idx=idx,
                         is_prompt=True)
                     self.profiler.record_counter(self.event_start, counters)
+            self.maybe_wait_for_kv_save(scheduler_output.scheduled_new_reqs)
             if self.is_driver_worker and self.profiler.enabled:
                 self.profiler_counter_helper.reset_prompt_seq_stats()

         ######################### DECODES #########################
         # Decodes run as one single batch with [padded_decode_bs, 1]
         if num_decodes > 0:
+            token_ids_s = decode_data.token_ids.shape
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "decode")
             assert decode_data is not None
             htorch.core.mark_step()
-            self.maybe_setup_kv_connector(scheduler_output)
             _, logits_device = \
                 self._execute_model_generic(
                     decode_data.token_ids, decode_data.position_ids,
                     decode_data.attn_metadata, decode_data.logits_indices,
-                    self.kv_caches)
+                    self.kv_caches, scheduler_output=scheduler_output)
             htorch.core.mark_step()
-            finished_sending, finished_recving = (
-                self.get_finished_kv_transfers(scheduler_output))
+
             with self.profiler.record_event('internal', "sampler"):
                 sampling_metadata = self._prepare_sampling(
                     batch_changed,
@@ -1758,7 +1790,8 @@
         prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
         all_req_ids = pd_info.decode_req_ids + pd_info.prompt_req_ids
         logprobs = None
-
+        finished_sending, finished_recving = (
+            self.get_finished_kv_transfers(scheduler_output))
         model_runner_output = ModelRunnerOutput(
             req_ids=all_req_ids,
             req_id_to_index=self.input_batch.req_id_to_index,
@@ -1769,9 +1802,60 @@
             finished_sending=finished_sending,
             finished_recving=finished_recving,
         )
+        #logger.debug(f"buke hpu_model_runner.py: {model_runner_output=}")
         if has_kv_transfer_group():
             get_kv_transfer_group().clear_connector_metadata()
+        s2= time.perf_counter()
+        # if token_ids_s and num_prefills > 0:
+        #     logger.info(f"libin debug execute_model prompt {os.getenv('RANK')} {token_ids_s=} step time:{s2-s1}")
         return model_runner_output
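For reference, a hedged outline of the per-step connector call order the reworked execute_model above follows: metadata is bound and KV loads are started once before any forward pass, the prefill path waits for its saves, and finished transfers are collected a single time before building ModelRunnerOutput. The Connector protocol and run_step helper below are simplified stand-ins for this sketch, not the real KVConnectorBase_V1 API.

# Hedged sketch of the connector lifecycle mirrored from the diff above.
from typing import Callable, Optional, Protocol

class Connector(Protocol):
    def bind_connector_metadata(self, metadata) -> None: ...
    def start_load_kv(self, metadata) -> None: ...
    def wait_for_save(self) -> None: ...
    def get_finished(self, finished_req_ids) -> tuple[Optional[set], Optional[set]]: ...
    def clear_connector_metadata(self) -> None: ...

def run_step(connector: Optional[Connector], scheduler_output,
             run_prefill: Optional[Callable] = None,
             run_decode: Optional[Callable] = None):
    """One model-runner step: setup -> forward(s) -> wait for saves -> collect finished."""
    finished_sending, finished_recving = set(), set()
    if connector is not None:
        # Bind metadata and kick off async KV loads before any forward pass.
        connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
        connector.start_load_kv(scheduler_output.kv_connector_metadata)
    outputs = []
    if run_prefill is not None:
        outputs.append(run_prefill())
        if connector is not None:
            # Prefill side: block until its KV blocks are handed to the transfer engine.
            connector.wait_for_save()
    if run_decode is not None:
        outputs.append(run_decode())
    if connector is not None:
        # Collect finished sends/receives once per step, then drop the metadata.
        finished_sending, finished_recving = connector.get_finished(
            scheduler_output.finished_req_ids)
        connector.clear_connector_metadata()
    return outputs, finished_sending, finished_recving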
+    def kv_connector_no_forward(
+            self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
+        # KV send/recv even if no work to do.
+        with set_forward_context(None, self.vllm_config):
+            self.maybe_setup_kv_connector(scheduler_output)
+            finished_sending, finished_recving = (
+                self.get_finished_kv_transfers(scheduler_output))
+
+        if not finished_sending and not finished_recving:
+            return EMPTY_MODEL_RUNNER_OUTPUT
+
+        output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
+        output.finished_sending = finished_sending
+        output.finished_recving = finished_recving
+        return output
+
+    @staticmethod
+    def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
+        # Update KVConnector with the KVConnector metadata forward().
+        if has_kv_transfer_group():
+            kv_connector = get_kv_transfer_group()
+            assert isinstance(kv_connector, KVConnectorBase_V1)
+            assert scheduler_output.kv_connector_metadata is not None
+            kv_connector.bind_connector_metadata(
+                scheduler_output.kv_connector_metadata)
+
+            # Background KV cache transfers happen here.
+            # These transfers are designed to be async and the requests
+            # involved may be disjoint from the running requests.
+            # Do this here to save a collective_rpc.
+            #logger.debug(f'buke maybe_setup_kv_connector: {scheduler_output=}')
+            kv_connector.start_load_kv(scheduler_output.kv_connector_metadata)
+
+    @staticmethod
+    def maybe_wait_for_kv_save(req: Optional[NewRequestData]) -> None:
+        if has_kv_transfer_group():
+            get_kv_transfer_group().wait_for_save()
+
+    # @staticmethod
+    # def get_finished_kv_transfers(
+    #     scheduler_output: "SchedulerOutput",
+    # ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    #     if has_kv_transfer_group():
+    #         return get_kv_transfer_group().get_finished(
+    #             scheduler_output)
+    #     return None, None
+
     def load_model(self) -> None:
         import habana_frameworks.torch.core as htcore
@@ -1810,11 +1894,12 @@ def load_model(self) -> None:
         hidden_layer_markstep_interval = int(
             os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))
         model_config = getattr(self.model, "config", None)
-        modify_model_layers(
-            self.model,
-            get_target_layer_suffix_list(
-                model_config.model_type if model_config is not None else None),
-            hidden_layer_markstep_interval)
+        if self.do_mark_step:
+            modify_model_layers(
+                self.model,
+                get_target_layer_suffix_list(
+                    model_config.model_type if model_config is not None else None),
+                hidden_layer_markstep_interval)
         torch.hpu.synchronize()

         with HabanaMemoryProfiler() as m:  # noqa: SIM117
@@ -2208,6 +2293,7 @@ def warmup_model(self) -> None:
         if prompt_profile_cfg or decode_profile_cfg:
             self._generate_profiling(prompt_profile_cfg, decode_profile_cfg)
             raise AssertionError("Finished profiling")
+        #self.bucketing_ctx.generate_prompt_buckets()
         kv_caches = self.kv_caches
         self.bucketing_manager.generate_prompt_buckets()
         self.bucketing_manager.generate_decode_buckets()
@@ -2315,6 +2401,12 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                 "Hybrid models with more than one KV cache type are not "
                 "supported yet.")

+        # build a map from layer_name -> KVCacheTensor
+        tensor_map: dict[str, KVCacheTensor] = {}
+        for tensor in kv_cache_config.kv_cache_tensors:
+            for lname in tensor.shared_by:
+                tensor_map[lname] = tensor
+
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_sizes = {}
         for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
@@ -2342,6 +2434,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                 v_cache_shape = None if self.model_config.use_mla \
                     else kv_cache_shape
                 dtype = kv_cache_spec.dtype
+                #logger.debug(f'buke: |{os.getpid()=}|{kv_cache_shape=}')
                 key_cache = torch.zeros(kv_cache_shape,
                                         dtype=dtype,
                                         device=self.device)
@@ -2352,6 +2445,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                 else:
                     value_cache = None
                 kv_caches[layer_name] = (key_cache, value_cache)
+                #logger.debug(f"buke initialize_kv_cache: {key_cache.data_ptr()=}|{value_cache.data_ptr()=}")
             else:
                 # TODO: add new branches when introducing more types of
                 # KV cache specs.
@@ -2367,17 +2461,28 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self._PAD_BLOCK_ID = num_blocks
         self._PAD_SLOT_ID = num_blocks * self.block_size

+        if has_kv_transfer_group():
+            #kv_caches = { layer: torch.stack((tup[0], tup[1])) for layer,tup in kv_caches.items()}
+            get_kv_transfer_group().register_kv_caches(kv_caches)
+            if get_kv_transfer_group().connector_worker.kv_buffer_device == "cpu":
+                get_kv_transfer_group().set_host_xfer_buffer_ops(copy_kv_blocks)
+            global hpu_buffer
+            #if hpu_buffer is None:
+            #    _, num_kv_heads, head_size = kv_cache_shape
+            #    shape =[len(kv_caches), 2, 8192 , num_kv_heads, head_size]
+            #    hpu_buffer = torch.empty(shape, dtype=kv_cache_spec.dtype, device=self.device)
+
         htorch.hpu.synchronize()
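A small sketch of the registration pattern above: after building per-layer (key, value) HPU tensors, hand them to the KV-transfer group, and when the transfer buffer lives on host memory (kv_buffer_device == "cpu") also install copy_kv_blocks as the staging callback. StubConnector and register_with_connector are hypothetical stand-ins that only mirror the call sequence in the diff.

# Hedged sketch of KV-cache registration with a CPU staging buffer.
import torch

class StubConnector:
    def __init__(self, kv_buffer_device: str = "cpu"):
        self.kv_buffer_device = kv_buffer_device
        self.kv_caches = None
        self.host_copy_fn = None

    def register_kv_caches(self, kv_caches: dict) -> None:
        # The real connector advertises these tensors to the transfer engine.
        self.kv_caches = kv_caches

    def set_host_xfer_buffer_ops(self, copy_fn) -> None:
        # Callback used to move blocks between device and host (d2h / h2d).
        self.host_copy_fn = copy_fn

def register_with_connector(connector, kv_caches, copy_fn):
    connector.register_kv_caches(kv_caches)
    if connector.kv_buffer_device == "cpu":
        # Staging through host memory: the connector will call copy_fn
        # d2h before sending and h2d after receiving.
        connector.set_host_xfer_buffer_ops(copy_fn)

if __name__ == "__main__":
    caches = {"layer.0": (torch.zeros(4, 2, 8), torch.zeros(4, 2, 8))}
    conn = StubConnector()
    register_with_connector(conn, caches, copy_fn=lambda *a, **k: None)
    assert conn.host_copy_fn is not None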
-    @staticmethod
-    def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
-        # Update KVConnector with the KVConnector metadata forward().
-        if has_kv_transfer_group():
-            kv_connector = get_kv_transfer_group()
-            assert isinstance(kv_connector, KVConnectorBase_V1)
-            assert scheduler_output.kv_connector_metadata is not None
-            kv_connector.bind_connector_metadata(
-                scheduler_output.kv_connector_metadata)
+    # @staticmethod
+    # def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
+    #     # Update KVConnector with the KVConnector metadata forward().
+    #     if has_kv_transfer_group():
+    #         kv_connector = get_kv_transfer_group()
+    #         assert isinstance(kv_connector, KVConnectorBase_V1)
+    #         assert scheduler_output.kv_connector_metadata is not None
+    #         kv_connector.bind_connector_metadata(
+    #             scheduler_output.kv_connector_metadata)

     @staticmethod
     def get_finished_kv_transfers(
@@ -2388,16 +2493,16 @@ def get_finished_kv_transfers(
                 scheduler_output.finished_req_ids)
         return None, None

-    def kv_connector_no_forward(
-            self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
-        # KV send/recv even if no work to do.
-        with set_forward_context(None, self.vllm_config):
-            self.maybe_setup_kv_connector(scheduler_output)
-            if has_kv_transfer_group():
-                kv_connector = get_kv_transfer_group()
-                kv_connector.start_load_kv(get_forward_context())
-            finished_sending, finished_recving = (
-                self.get_finished_kv_transfers(scheduler_output))
+    # def kv_connector_no_forward(
+    #         self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
+    #     # KV send/recv even if no work to do.
+    #     with set_forward_context(None, self.vllm_config):
+    #         self.maybe_setup_kv_connector(scheduler_output)
+    #         if has_kv_transfer_group():
+    #             kv_connector = get_kv_transfer_group()
+    #             kv_connector.start_load_kv(get_forward_context())
+    #         finished_sending, finished_recving = (
+    #             self.get_finished_kv_transfers(scheduler_output))

         if not finished_sending and not finished_recving:
             return EMPTY_MODEL_RUNNER_OUTPUT
@@ -2406,3 +2511,94 @@ def kv_connector_no_forward(
         output.finished_sending = finished_sending
         output.finished_recving = finished_recving
         return output
+
+
+def _make_src_and_dst_indices(
+    block_size: int,
+    src_block_ids: list[int],
+    dst_block_ids: list[int],
+    src_device: Union[torch.device, str],
+    dst_device: Union[torch.device, str],
+) -> tuple[torch.Tensor, torch.Tensor]:
+    #convert to slot mapping
+    src_slot_mapping = np.concat([np.arange(start=s*block_size, stop=(s+1)*block_size) for s in src_block_ids])
+    dst_slot_mapping = np.concat([np.arange(start=d*block_size, stop=(d+1)*block_size) for d in dst_block_ids])
+
+    src_slot_mapping = torch.tensor(src_slot_mapping,
+                                    device=src_device,
+                                    dtype=torch.int64)
+    dst_slot_mapping = torch.tensor(dst_slot_mapping,
+                                    device=dst_device,
+                                    dtype=torch.int64)
+    return src_slot_mapping, dst_slot_mapping
+
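_make_src_and_dst_indices above expands block ids into flat slot indices, with block b owning the range [b*block_size, (b+1)*block_size). Note that np.concat is the NumPy 2.x alias of np.concatenate; the portable spelling is used in the sketch below, whose helper name block_ids_to_slots is hypothetical.

# Hedged sketch: block ids -> flat slot indices, mirroring the helper above
# but using np.concatenate so it also runs on NumPy < 2.0.
import numpy as np

def block_ids_to_slots(block_ids: list, block_size: int) -> np.ndarray:
    """Block b owns the contiguous slot range [b*block_size, (b+1)*block_size)."""
    return np.concatenate(
        [np.arange(b * block_size, (b + 1) * block_size) for b in block_ids])

if __name__ == "__main__":
    # Blocks 2 and 5 with block_size=4 -> slots 8..11 and 20..23.
    assert block_ids_to_slots([2, 5], 4).tolist() == [8, 9, 10, 11, 20, 21, 22, 23]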
+def copy_kv_blocks(
+    block_size: int,
+    src_kv_caches: dict[str, torch.Tensor],
+    dst_kv_caches: dict[str, torch.Tensor],
+    src_block_ids: list[int],
+    dst_block_ids: list[int],
+    direction: Literal["h2d", "d2h"],
+) -> None:
+
+    """Copy kv blocks between different buffers."""
+    if not src_kv_caches or not dst_kv_caches or \
+        not src_block_ids or not dst_block_ids or \
+        len(src_block_ids) != len(dst_block_ids):
+        return
+    assert len(src_block_ids) == len(dst_block_ids)
+    src_device = next(iter(src_kv_caches.values()))[0].device
+    dst_device = next(iter(dst_kv_caches.values()))[0].device
+
+    src_slot_mapping, dst_slot_mapping = _make_src_and_dst_indices(
+        block_size=block_size,
+        src_block_ids=src_block_ids,
+        dst_block_ids=dst_block_ids,
+        src_device=src_device,
+        dst_device=dst_device)
+
+    start = time.perf_counter()
+    target_device = dst_device.type
+
+    i = 0
+    global hpu_buffer, is_hetero, block_factor
+    use_hpu_buffer = False  # (len(src_slot_mapping) == hpu_buffer[0][0].size(0)) and (hpu_buffer is not None)
+    for layer_name in src_kv_caches:
+        key_cache = src_kv_caches[layer_name][0]
+        value_cache = src_kv_caches[layer_name][1]
+
+        if is_hetero:
+            assert direction == "h2d", "hetero only supports h2d for now"
+            n_kv_heads, head_dim = key_cache.shape[-2:]
+            remote_block_size = block_size//block_factor
+            # block_factor, n_kv_heads, remote_block_size, head_dim = 8, 8, 16, 128
+            if len(src_block_ids) == src_block_ids[-1]-src_block_ids[0] + 1:  # simple check if the indices are contiguous
+                block_idx = src_block_ids[0]
+                num_blocks = len(src_block_ids)
+                dst_kv_caches[layer_name][0][block_idx*block_size: (num_blocks+block_idx)*block_size] = key_cache[block_idx*block_size: (num_blocks+block_idx)*block_size].reshape(num_blocks*block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(num_blocks*block_size,n_kv_heads,head_dim)
+                dst_kv_caches[layer_name][1][block_idx*block_size: (num_blocks+block_idx)*block_size] = value_cache[block_idx*block_size: (num_blocks+block_idx)*block_size].reshape(num_blocks*block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(num_blocks*block_size,n_kv_heads,head_dim)
+                continue
+            for block_idx in src_block_ids:
+                #print('buke addr before:', dst_kv_caches[layer_name][0][block_idx*block_size: (1+block_idx)*block_size].data_ptr())
+                dst_kv_caches[layer_name][0][block_idx*block_size: (1+block_idx)*block_size] = key_cache[block_idx*block_size: (1+block_idx)*block_size].reshape(block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(block_size,n_kv_heads,head_dim).to("hpu")
+                dst_kv_caches[layer_name][1][block_idx*block_size: (1+block_idx)*block_size] = value_cache[block_idx*block_size: (1+block_idx)*block_size].reshape(block_factor, n_kv_heads, remote_block_size, head_dim).permute(0,2,1,3).contiguous().reshape(block_size,n_kv_heads,head_dim).to("hpu")
+                #print('buke addr after:', dst_kv_caches[layer_name][0][block_idx*block_size: (1+block_idx)*block_size].data_ptr())
+        else:
+            if direction == "d2h" and use_hpu_buffer:
+                hpu_buffer[i][0]=key_cache.index_select_(0, src_slot_mapping)
+                hpu_buffer[i][1]=value_cache.index_select_(0, src_slot_mapping)
+            else:
+                #import remote_pdb;remote_pdb.set_trace()
+                dst_kv_caches[layer_name][0].index_put_((dst_slot_mapping,), key_cache.index_select(0, src_slot_mapping).to(target_device))
+                dst_kv_caches[layer_name][1].index_put_((dst_slot_mapping,), value_cache.index_select(0, src_slot_mapping).to(target_device))
+        i = i+1
+
+    #dst_kv_caches[layer_name][0][dst_slot_mapping] = key_cache[src_slot_mapping].to(target_device)
+    #dst_kv_caches[layer_name][1][dst_slot_mapping] = value_cache[src_slot_mapping].to(target_device)
+    #if use_hpu_buffer:
+        #tmp = hpu_buffer.to('cpu')
+        #dst_kv_caches = hpu_buffer.to('cpu')
+
+    torch.hpu.synchronize()
+
+    logger.info(f"copy_kv_blocks: copy takes {time.perf_counter() - start}|{direction=}|{os.getpid()=}|{block_size=}|{len(src_block_ids)=}|{len(dst_block_ids)=}| {len(src_kv_caches)=} | ")
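The is_hetero branch above appears to repack data received from a sender that uses a smaller block size: as the reshape implies, each received chunk is laid out as [n_kv_heads, remote_block_size, head_dim], and block_factor such chunks fill one local block of per-slot layout [block_size, n_kv_heads, head_dim]. A CPU-only sketch of that axis swap, under those layout assumptions; repack_remote_block is a hypothetical helper, not part of the patch.

# Hedged sketch of the heterogeneous-layout repack (head and token axes swapped).
import torch

def repack_remote_block(flat_src: torch.Tensor, block_factor: int, n_kv_heads: int,
                        remote_block_size: int, head_dim: int) -> torch.Tensor:
    """flat_src holds one local block's worth of received data, flattened in
    sender order (block_factor chunks of [n_kv_heads, remote_block_size, head_dim])."""
    block_size = block_factor * remote_block_size
    return (flat_src.reshape(block_factor, n_kv_heads, remote_block_size, head_dim)
                    .permute(0, 2, 1, 3)   # -> [factor, remote_bs, heads, dim]
                    .contiguous()
                    .reshape(block_size, n_kv_heads, head_dim))

if __name__ == "__main__":
    bf, heads, rbs, dim = 2, 3, 4, 5
    src = torch.arange(bf * heads * rbs * dim, dtype=torch.float32)
    out = repack_remote_block(src.reshape(bf * rbs, heads, dim), bf, heads, rbs, dim)
    # Token t of remote sub-block f, head h must come from src[f, h, t] in sender layout.
    src4 = src.reshape(bf, heads, rbs, dim)
    assert out[1 * rbs + 2, 0, 3] == src4[1, 0, 2, 3]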