Commit 81f3b9c
[0.9.1-DEV][BUGFIX] BugFix: Resolve the issue of waiting queue accumulation when requests are canceled. (#2502)
### What this PR does / why we need it?
Resolve the issue of waiting queue accumulation when requests are canceled.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.

---------
Signed-off-by: zouyida <[email protected]>
Co-authored-by: zouyida <[email protected]>
1 parent 763ed69 commit 81f3b9c

8 files changed: +158 −25 lines

docs/source/user_guide/release_notes.md

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ This is the 3rd release candidate of v0.9.1 for vLLM Ascend. Please follow the [
 - Fix header include issue in rope [#2398](https://github.com/vllm-project/vllm-ascend/pull/2398)
 - Fix mtp config bug [#2412](https://github.com/vllm-project/vllm-ascend/pull/2412)
 - Fix error info and adapt `attn_metedata` refactor [#2402](https://github.com/vllm-project/vllm-ascend/pull/2402)
-- Fix torchair runtime errror caused by configuration mismtaches and `.kv_cache_bytes` file missing [#2312](https://github.com/vllm-project/vllm-ascend/pull/2312)
+- Fix torchair runtime error caused by configuration mismtaches and `.kv_cache_bytes` file missing [#2312](https://github.com/vllm-project/vllm-ascend/pull/2312)
 - Move `with_prefill` allreduce from cpu to npu [#2230](https://github.com/vllm-project/vllm-ascend/pull/2230)

 ### Docs

examples/disaggregate_prefill_v1/README.md

Lines changed: 1 addition & 1 deletion

@@ -205,7 +205,7 @@ vllm serve /models/deepseek_r1_w8a8 \
 Run proxy server on the first node:
 ```shell
 cd /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1
-python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
+python load_balance_proxy_server_example.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.32.175 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
 ```

 Verification

examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py

Lines changed: 31 additions & 3 deletions

@@ -4,9 +4,11 @@

 import argparse
 import asyncio
+import functools
 import heapq
 import os
 import sys
+import uuid
 from contextlib import asynccontextmanager
 from typing import List

@@ -54,7 +56,6 @@ def __init__(self, prefiller_instances, decoder_instances):
         ]
         self.req_to_prefiller = {}
         self.req_id_lock = asyncio.Lock()
-        self.req_id_counter = 0
         # Removed selection locks - no longer needed for synchronous methods

         # Initialize priority queues for efficient server selection
@@ -110,8 +111,7 @@ def aquire_aborted_prefiller_requests(

     async def next_req_id(self):
         async with self.req_id_lock:
-            self.req_id_counter += 1
-            return str(self.req_id_counter)
+            return str(uuid.uuid4())

     def select_prefiller(self, token_count):  # Changed to synchronous
         # No lock needed - entire function is atomic
@@ -230,6 +230,32 @@ async def lifespan(app: FastAPI):
         await d.client.aclose()


+async def listen_for_disconnect(request: Request) -> None:
+    """Return if a disconnect message is received"""
+    while True:
+        message = await request.receive()
+        if message["type"] == "http.disconnect":
+            break
+
+
+def with_cancellation(handler_func):
+
+    @functools.wraps(handler_func)
+    async def wrapper(*args, **kwargs):
+        request = kwargs["request"]
+        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
+        cancellation_task = asyncio.create_task(listen_for_disconnect(request))
+        done, pending = await asyncio.wait([handler_task, cancellation_task],
+                                           return_when=asyncio.FIRST_COMPLETED)
+        for task in pending:
+            task.cancel()
+        if handler_task in done:
+            return handler_task.result()
+        return None
+
+    return wrapper
+
+
 app = FastAPI(lifespan=lifespan)


@@ -410,11 +436,13 @@ async def generate_stream():


 @app.post("/v1/completions")
+@with_cancellation
 async def handle_completions(request: Request):
     return await _handle_completions("/completions", request)


 @app.post("/v1/chat/completions")
+@with_cancellation
 async def handle_chat_completions(request: Request):
     return await _handle_completions("/chat/completions", request)

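The `with_cancellation` decorator added here races each handler against a client-disconnect watcher, so an aborted HTTP request cancels the in-flight proxy call instead of leaving it queued. A minimal sketch of the same pattern that runs without a live server (the `FakeRequest` stub and the timings are illustrative, not part of this commit):

```python
import asyncio
import functools


async def listen_for_disconnect(request) -> None:
    """Return once the client sends an http.disconnect message."""
    while True:
        message = await request.receive()
        if message["type"] == "http.disconnect":
            break


def with_cancellation(handler_func):
    """Race the handler against a disconnect watcher; cancel the loser."""

    @functools.wraps(handler_func)
    async def wrapper(*args, **kwargs):
        request = kwargs["request"]
        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
        cancellation_task = asyncio.create_task(listen_for_disconnect(request))
        done, pending = await asyncio.wait([handler_task, cancellation_task],
                                           return_when=asyncio.FIRST_COMPLETED)
        for task in pending:
            task.cancel()
        if handler_task in done:
            return handler_task.result()
        return None  # client disconnected first; drop the response

    return wrapper


class FakeRequest:
    """Hypothetical stand-in for fastapi.Request: disconnects after 0.1 s."""

    async def receive(self):
        await asyncio.sleep(0.1)
        return {"type": "http.disconnect"}


@with_cancellation
async def slow_handler(*, request):
    await asyncio.sleep(10)  # simulated long generation
    return "done"


# The handler is cancelled ~0.1 s in, long before its 10 s sleep finishes.
print(asyncio.run(slow_handler(request=FakeRequest())))  # -> None
```

Because `asyncio.wait(..., return_when=FIRST_COMPLETED)` returns as soon as either task finishes, the losing task is always cancelled, which is what frees the backend request when the client goes away.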

vllm_ascend/distributed/llmdatadist_c_mgr_connector.py

Lines changed: 1 addition & 1 deletion

@@ -884,7 +884,7 @@ def get_finished(
             if now < expires:
                 break
             logger.warning(
-                "Some requests in prefill node fail to receive KV Cache transfer done signal. "
+                f"Some requests in prefill node fail to receive KV Cache transfer done signal in {envs.VLLM_LLMDD_ABORT_REQUEST_TIMEOUT}s. "
                 "If a greater mean TTFT is acceptable, you can 'export VLLM_LLMDD_ABORT_REQUEST_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
             )
             if req_id in self.reqs_to_send:

vllm_ascend/envs.py

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@
     # `VLLM_LLMDD_ABORT_REQUEST_TIMEOUT` is only applicable when using LLMDataDistCMgrConnector in a
     # disaggregated decode-prefill setup.
     "VLLM_LLMDD_ABORT_REQUEST_TIMEOUT":
-    lambda: int(os.getenv("VLLM_LLMDD_ABORT_REQUEST_TIMEOUT", 300)),
+    lambda: int(os.getenv("VLLM_LLMDD_ABORT_REQUEST_TIMEOUT", 120)),
     # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
     # and the mla_pa will be the default path of deepseek decode path.
     "VLLM_ASCEND_MLA_PA":

vllm_ascend/patch/platform/__init__.py

Lines changed: 2 additions & 8 deletions

@@ -14,12 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from vllm_ascend.utils import vllm_version_is
-
 # Import specific patches for different versions
-if vllm_version_is("0.9.1"):
-    from vllm_ascend.patch.platform import patch_0_9_1  # noqa: F401
-    from vllm_ascend.patch.platform import patch_common  # noqa: F401
-else:
-    from vllm_ascend.patch.platform import patch_common  # noqa: F401
-    from vllm_ascend.patch.platform import patch_main  # noqa: F401
+from vllm_ascend.patch.platform import patch_0_9_1  # noqa: F401
+from vllm_ascend.patch.platform import patch_common  # noqa: F401

vllm_ascend/patch/platform/patch_0_9_1/patch_core.py

Lines changed: 119 additions & 2 deletions

@@ -1,12 +1,16 @@
 import os
 import signal
-from typing import Optional
+import types
+from collections.abc import Iterable
+from typing import Optional, Union

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import \
     maybe_register_config_serialize_by_value
 from vllm.v1.engine.core import DPEngineCoreProc, EngineCoreProc
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.request import RequestStatus

 import vllm_ascend.envs as vllm_ascend_envs

@@ -77,7 +81,10 @@ def run_busy_loop(self):
             self.execute_dummy_batch()


-def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
+def run_engine_core_dplb(*args,
+                         dp_rank: int = 0,
+                         local_dp_rank: int = 0,
+                         **kwargs):
     """Launch EngineCore busy loop in background process."""

     # Signal handler used for graceful termination.
@@ -108,7 +115,115 @@ def signal_handler(signum, frame):
             engine_core = ExternealDPEngineCoreProc(*args, **kwargs)
         else:
             engine_core = EngineCoreProc(*args, **kwargs)
+        engine_core.scheduler.finish_requests = types.MethodType(
+            finish_requests, engine_core.scheduler)
+        engine_core.scheduler._update_from_kv_xfer_finished = types.MethodType(
+            _update_from_kv_xfer_finished, engine_core.scheduler)
+        engine_core.run_busy_loop()
+
+    except SystemExit:
+        logger.debug("EngineCore exiting.")
+        raise
+    except Exception as e:
+        if engine_core is None:
+            logger.exception("EngineCore failed to start.")
+        else:
+            logger.exception("EngineCore encountered a fatal error.")
+            engine_core._send_engine_dead()
+        raise e
+    finally:
+        if engine_core is not None:
+            engine_core.shutdown()
+
+
+def finish_requests(
+    self,
+    request_ids: Union[str, Iterable[str]],
+    finished_status: RequestStatus,
+) -> None:
+    """Handles the finish signal from outside the scheduler.
+    For example, the API server can abort a request when the client
+    disconnects.
+    """
+    assert RequestStatus.is_finished(finished_status)
+    if isinstance(request_ids, str):
+        request_ids = (request_ids, )
+    else:
+        request_ids = set(request_ids)
+
+    for req_id in request_ids:
+        request = self.requests.get(req_id)
+        if request is None:
+            # Invalid request ID.
+            continue
+        if request in self.waiting or request in self.running:
+            if request.status == RequestStatus.RUNNING:
+                self.running.remove(request)
+            else:
+                self.waiting.remove(request)
+            request.status = finished_status
+            self._free_request(request)
+
+
+def _update_from_kv_xfer_finished(self,
+                                  model_runner_output: ModelRunnerOutput):
+    """
+    KV Connector: update the scheduler state based on the output.
+    The Worker side connectors add finished_recving and
+    finished_sending reqs to the output.
+    * if finished_sending: free the blocks
+    # if finished_recving: add to state so we can
+    scheduler the request during the next step.
+    """
+    # KV Connector:: update recv and send status from last step.
+    for req_id in (model_runner_output.finished_recving or ()):
+        logger.debug("Finished recving KV transfer for request %s", req_id)
+        self.finished_recving_kv_req_ids.add(req_id)
+    for req_id in (model_runner_output.finished_sending or ()):
+        logger.debug("Finished sending KV transfer for request %s", req_id)
+        if req_id in self.requests:
+            self._free_blocks(self.requests[req_id])
+        else:
+            logger.debug("cannot find the req_id it may have been aborted.%s",
+                         req_id)
+
+
+def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
+    """Launch EngineCore busy loop in background process."""
+
+    # Signal handler used for graceful termination.
+    # SystemExit exception is only raised once to allow this and worker
+    # processes to terminate without error
+    shutdown_requested = False
+
+    # Ensure we can serialize transformer config after spawning
+    maybe_register_config_serialize_by_value()
+
+    def signal_handler(signum, frame):
+        nonlocal shutdown_requested
+        if not shutdown_requested:
+            shutdown_requested = True
+            raise SystemExit()
+
+    # Either SIGTERM or SIGINT will terminate the engine_core
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
+    engine_core: Optional[EngineCoreProc] = None
+    try:
+        parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
+        if parallel_config.data_parallel_size > 1 or dp_rank > 0:
+            # Set data parallel rank for this engine process.
+            parallel_config.data_parallel_rank = dp_rank
+            parallel_config.data_parallel_rank_local = local_dp_rank
+            engine_core = DPEngineCoreProc(*args, **kwargs)
+        else:
+            engine_core = EngineCoreProc(*args, **kwargs)

+        engine_core.scheduler.finish_requests = types.MethodType(
+            finish_requests, engine_core.scheduler)
+        engine_core.scheduler._update_from_kv_xfer_finished = types.MethodType(
+            _update_from_kv_xfer_finished, engine_core.scheduler)
         engine_core.run_busy_loop()

     except SystemExit:
@@ -129,4 +244,6 @@ def signal_handler(signum, frame):
 # Apply this patch only if the external data parallelism is enabled
 if vllm_ascend_envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED:
     # Patch the EngineCoreClient to use the custom make_async_mp_client
+    EngineCoreProc.run_engine_core = run_engine_core_dplb  # type: ignore[attr-defined]
+else:
     EngineCoreProc.run_engine_core = run_engine_core  # type: ignore[attr-defined]
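Both launch paths swap `finish_requests` and `_update_from_kv_xfer_finished` onto the live scheduler with `types.MethodType`, so only that instance changes and no subclass is needed. A minimal sketch of this binding technique (the `Scheduler` stub and its queue handling below are illustrative, not vLLM's real scheduler):

```python
import types


class Scheduler:
    """Hypothetical stand-in for vLLM's scheduler (illustrative only)."""

    def __init__(self):
        self.waiting = ["req-1", "req-2"]  # queued requests
        self.running = []


def finish_requests(self, request_ids):
    # Replacement behavior: drop finished/canceled requests from both
    # queues, so a canceled request cannot accumulate in `waiting`.
    self.waiting = [r for r in self.waiting if r not in request_ids]
    self.running = [r for r in self.running if r not in request_ids]


scheduler = Scheduler()

# Bind the free function as a method of this one instance; the first
# argument (`self`) is fixed to `scheduler`, and the class is untouched.
scheduler.finish_requests = types.MethodType(finish_requests, scheduler)

scheduler.finish_requests({"req-1"})
print(scheduler.waiting)  # -> ['req-2']
```

Binding at the instance level keeps the patch scoped to the scheduler object created inside the engine process, instead of altering the scheduler class globally.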

vllm_ascend/patch/worker/__init__.py

Lines changed: 2 additions & 8 deletions

@@ -15,12 +15,6 @@
 # limitations under the License.
 #

-from vllm_ascend.utils import vllm_version_is
-
 # Import specific patches for different versions
-if vllm_version_is("0.9.1"):
-    from vllm_ascend.patch.worker import patch_0_9_1  # noqa: F401
-    from vllm_ascend.patch.worker import patch_common  # noqa: F401
-else:
-    from vllm_ascend.patch.worker import patch_common  # noqa: F401
-    from vllm_ascend.patch.worker import patch_main  # noqa: F401
+from vllm_ascend.patch.worker import patch_0_9_1  # noqa: F401
+from vllm_ascend.patch.worker import patch_common  # noqa: F401
