Skip to content

Commit 6789bb6

Browse files
ahenglj and claude committed
[Review] Address PR #1303 feedback: reduce scope, fix review comments
- Remove non-PD files: gpu_ar_model_runner.py (debug logging only), omni_ar_scheduler.py and omni_generation_scheduler.py (general compat shims, not PD-specific), pd_server_patch_guide.md (superseded by monkey_patch.py)
- Downgrade all KV-DIAG logging from WARNING to DEBUG (omni_llm.py, omni_stage.py)
- Strip verbose per-step/per-batch diagnostic scaffolding from omni_llm.py and omni_stage.py
- patched_mooncake_connector: call super().add_new_req() instead of skipping; use copy-and-restore pattern in group_kv_pull
- omni.py: refactor _detect_pd_separation to single-pass; deduplicate _kv_cfg_to_dict/_normalize_kv_transfer_params into _to_dict()
- async_omni.py: unify PD routing merge semantics with sync path
- qwen3_omni stage_input_processors: replace hardcoded "0"/"24" layer keys with named constants
- qwen3_omni model: document zero-padding safety for PD disaggregation
- omni_llm: add comment explaining why _flush_kv_connector_sends reaches into vLLM internals

PR scope reduced from 15 to 11 files.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 606e7cf commit 6789bb6

File tree

11 files changed

+150
-566
lines changed

11 files changed

+150
-566
lines changed

docs/design/pd_server_patch_guide.md

Lines changed: 0 additions & 185 deletions
This file was deleted.

vllm_omni/core/sched/omni_ar_scheduler.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -68,29 +68,6 @@ def __init__(self, *args, **kwargs):
6868
if getattr(model_config, "async_chunk", False):
6969
self.chunk_transfer_adapter = OmniChunkTransferAdapter(self.vllm_config)
7070

71-
def _get_routed_experts(self, request: Request):
72-
"""Return routed-experts array for *request*, or ``None``.
73-
74-
Delegates to the parent ``Scheduler`` when it provides this method
75-
(vLLM >= 0.9); otherwise returns ``None`` so that older vLLM
76-
installations don't crash.
77-
"""
78-
parent = getattr(super(), "_get_routed_experts", None)
79-
if parent is not None:
80-
return parent(request)
81-
return None
82-
83-
def _handle_stopped_request(self, request: Request) -> bool:
84-
"""Handle a stopped request — returns ``True`` when truly finished.
85-
86-
Delegates to the parent ``Scheduler`` when it provides this method
87-
(vLLM >= 0.9); otherwise falls back to checking the request status.
88-
"""
89-
parent = getattr(super(), "_handle_stopped_request", None)
90-
if parent is not None:
91-
return parent(request)
92-
return request.status.is_finished
93-
9471
def _get_kv_transfer_criteria(self) -> dict | None:
9572
# Note: vllm_config is available in Scheduler after super().__init__
9673
if not hasattr(self, "vllm_config"):

vllm_omni/core/sched/omni_generation_scheduler.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,29 +30,6 @@ def __init__(self, *args, **kwargs):
3030
if getattr(model_config, "async_chunk", False):
3131
self.chunk_transfer_adapter = OmniChunkTransferAdapter(self.vllm_config)
3232

33-
def _get_routed_experts(self, request: Request):
34-
"""Return routed-experts array for *request*, or ``None``.
35-
36-
Delegates to the parent ``Scheduler`` when it provides this method
37-
(vLLM >= 0.9); otherwise returns ``None`` so that older vLLM
38-
installations don't crash.
39-
"""
40-
parent = getattr(super(), "_get_routed_experts", None)
41-
if parent is not None:
42-
return parent(request)
43-
return None
44-
45-
def _handle_stopped_request(self, request: Request) -> bool:
46-
"""Handle a stopped request — returns ``True`` when truly finished.
47-
48-
Delegates to the parent ``Scheduler`` when it provides this method
49-
(vLLM >= 0.9); otherwise falls back to checking the request status.
50-
"""
51-
parent = getattr(super(), "_handle_stopped_request", None)
52-
if parent is not None:
53-
return parent(request)
54-
return request.status.is_finished
55-
5633
def schedule(self) -> SchedulerOutput:
5734
"""Diffusion fast path:
5835
- Feed all input tokens of the request at once

vllm_omni/distributed/kv_transfer/patched_mooncake_connector.py

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,22 @@ def add_new_req(
149149
kv_transfer_params: dict[str, Any] | None = None,
150150
**kwargs: Any,
151151
) -> None:
152-
"""Override to store a ``PatchedRecvReqMeta`` that remembers the
153-
prefill engine's ``remote_request_id``.
152+
"""Call ``super().add_new_req()`` for all requests, then layer
153+
PD-specific ``PatchedRecvReqMeta`` on top for decode-side
154+
(``load_remote_cache=True``) requests.
154155
155-
When ``kv_transfer_params`` contains ``"remote_request_id"``, we
156-
use it for the ZMQ look-up key. Otherwise we fall back to the
157-
local ``request_id`` (original behaviour).
156+
This ensures any future logic added to the base method is
157+
always executed, while still providing the
158+
``remote_request_id`` mapping needed for PD disaggregation.
158159
"""
160+
# Always call super() so base-class bookkeeping is preserved.
161+
super().add_new_req(
162+
request_id,
163+
local_block_ids,
164+
kv_transfer_params,
165+
**kwargs,
166+
)
167+
159168
kv_transfer_params = kv_transfer_params or {}
160169
load_remote_cache = kv_transfer_params.get(
161170
"do_remote_prefill",
@@ -172,7 +181,8 @@ def add_new_req(
172181
local_block_ids=local_block_ids,
173182
kv_transfer_params=kv_transfer_params,
174183
)
175-
# Store in the same structure the base class uses
184+
# Override the entry created by super() with our patched
185+
# version that carries remote_request_id.
176186
if not hasattr(self, "_reqs_need_recv"):
177187
self._reqs_need_recv = {}
178188
self._reqs_need_recv[request_id] = meta
@@ -183,27 +193,25 @@ def add_new_req(
183193
remote_request_id,
184194
self.engine_id,
185195
)
186-
else:
187-
# Producer side — delegate to original
188-
super().add_new_req(
189-
request_id,
190-
local_block_ids,
191-
kv_transfer_params,
192-
**kwargs,
193-
)
194196

195197
def group_kv_pull(self, metadata: Any | None = None) -> None:
196198
"""Override to use ``meta.remote_request_id`` as the ZMQ look-up
197199
key instead of the local request ID.
198200
199-
After issuing the pull, we record the remote→local mapping in
200-
``self.remote_to_local_req`` so ``receive_kv`` can translate
201-
back.
201+
We build a patched copy of ``_reqs_need_recv`` with
202+
``remote_request_id`` as the key so the base class ZMQ logic
203+
looks up the correct remote KV cache. The original dict is
204+
restored after ``super().group_kv_pull()`` returns to avoid
205+
confusing the base class with unexpected mutations.
202206
"""
203207
if not hasattr(self, "_reqs_need_recv") or not self._reqs_need_recv:
204208
return
205209

206-
for local_id, meta in list(self._reqs_need_recv.items()):
210+
# Build a patched copy; keep the original for restoration.
211+
original_recv = self._reqs_need_recv.copy()
212+
patched_recv: dict[str, Any] = {}
213+
214+
for local_id, meta in original_recv.items():
207215
if isinstance(meta, PatchedRecvReqMeta):
208216
remote_id = meta.remote_request_id
209217
self.remote_to_local_req[remote_id] = local_id
@@ -213,19 +221,30 @@ def group_kv_pull(self, metadata: Any | None = None) -> None:
213221
remote_id,
214222
local_id,
215223
)
216-
# Replace with a fake meta that uses remote_id as request_id
217-
# so the base class ZMQ logic uses remote_id to look up KV
224+
# Use remote_id as key so the base class ZMQ logic
225+
# looks up KV under the prefill engine's request ID.
218226
patched_meta = type(meta)(
219227
request_id=remote_id,
220228
remote_request_id=remote_id,
221229
local_block_ids=meta.local_block_ids,
222230
kv_transfer_params=meta.kv_transfer_params,
223231
)
224-
self._reqs_need_recv[local_id] = patched_meta
232+
patched_recv[remote_id] = patched_meta
233+
else:
234+
patched_recv[local_id] = meta
225235

226-
# Delegate the actual ZMQ transfer to the base class
236+
# Swap in the patched dict, delegate to the base class, then
237+
# restore entries that weren't consumed.
238+
self._reqs_need_recv = patched_recv
227239
super().group_kv_pull(metadata)
228240

241+
# Restore any entries that the base class didn't consume
242+
# (e.g. still pending transfer) back to their original keys.
243+
for remote_id, local_id in list(self.remote_to_local_req.items()):
244+
if remote_id in self._reqs_need_recv:
245+
entry = self._reqs_need_recv.pop(remote_id)
246+
self._reqs_need_recv[local_id] = original_recv.get(local_id, entry)
247+
229248
def receive_kv(self, path: Any = None, req_blocks: Any = None) -> Any:
230249
"""After the base class completes the ZMQ transfer, map
231250
``remote_id`` back to ``local_id`` in any result structures.

vllm_omni/entrypoints/async_omni.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -518,18 +518,29 @@ async def _process_sequential_results(
518518
"transfer_id": f"xfer-{request_id}",
519519
}
520520

521+
# Merge any user-provided decode-side kv_transfer_params
522+
# first (same semantics as the sync path in omni.py).
523+
existing_kv_params = self._normalize_kv_transfer_params(
524+
sp_next.extra_args.get("kv_transfer_params")
525+
)
526+
if existing_kv_params:
527+
decode_kv_params.update(existing_kv_params)
528+
529+
# Add prefill engine connection info from config
530+
# (only fill in keys that aren't already present).
521531
if self._pd_connector_info:
522532
eid = self._pd_connector_info.get("prefill_engine_id")
523-
if eid is not None:
533+
if eid is not None and "remote_engine_id" not in decode_kv_params:
524534
decode_kv_params["remote_engine_id"] = eid
525535
baddr = self._pd_connector_info.get("prefill_bootstrap_addr")
526-
if baddr is not None:
536+
if baddr is not None and "remote_bootstrap_addr" not in decode_kv_params:
527537
decode_kv_params["remote_bootstrap_addr"] = baddr
528538

529539
kv_from_prefill = self._extract_kv_transfer_params(engine_outputs)
530540
if kv_from_prefill:
531541
decode_kv_params.update(kv_from_prefill)
532542

543+
# Ensure the decode role flags are correct after merges
533544
decode_kv_params["do_remote_prefill"] = True
534545
decode_kv_params["do_remote_decode"] = False
535546

0 commit comments

Comments (0)