Skip to content

Commit 4a2a62c

Browse files
ahengljhclaude
and committed
[Bugfix] Merge prefill + decode embeddings in thinker2talker for PD disaggregation
In PD mode the decode engine's multimodal_output only covers tokens it computed (~9), but thinker_sequences has the full prompt + generated tokens (~20). This misalignment caused the talker to map embeddings to wrong token positions, producing garbled output. Now thinker2talker detects PD mode via a preceding prefill stage and merges prefill prompt embeddings with decode generated embeddings so the talker receives the complete, correctly-aligned sequence. TTS embeds also fall back to the prefill stage if missing from decode. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d8213d5 commit 4a2a62c

File tree

1 file changed

+105
-6
lines changed

1 file changed

+105
-6
lines changed

vllm_omni/model_executor/stage_input_processors/qwen3_omni.py

Lines changed: 105 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Copyright 2025 The Qwen team.
44
"""Stage input processor for Qwen3 Omni MoE: Thinker → Talker transition."""
55

6+
import logging
67
from typing import Any
78

89
import torch
@@ -12,6 +13,8 @@
1213
from vllm_omni.engine import OmniEngineCoreRequest
1314
from vllm_omni.inputs.data import OmniTokensPrompt
1415

16+
logger = logging.getLogger(__name__)
17+
1518

1619
def _compute_talker_prompt_ids_length(info, device: torch.device | str = "cuda") -> int:
1720
im_start_token_id = 151644
@@ -141,6 +144,62 @@ def thinker2talker_async_chunk(
141144
return talker_additional_info
142145

143146

147+
def _get_prefill_stage(stage_list: list[Any], source_stage_id: int) -> Any | None:
148+
"""Return the preceding prefill stage if PD disaggregation is active."""
149+
if source_stage_id <= 0:
150+
return None
151+
source_stage = stage_list[source_stage_id]
152+
if not getattr(source_stage, "is_decode_only", False):
153+
return None
154+
prev_stage = stage_list[source_stage_id - 1]
155+
if (
156+
getattr(prev_stage, "is_prefill_only", False)
157+
and prev_stage.engine_outputs is not None
158+
):
159+
return prev_stage
160+
return None
161+
162+
163+
def _merge_pd_embeddings(
164+
decode_emb: torch.Tensor,
165+
decode_hid: torch.Tensor,
166+
prefill_mm: dict[str, Any],
167+
device: torch.device,
168+
) -> tuple[torch.Tensor, torch.Tensor]:
169+
"""Merge prefill prompt embeddings with decode generated embeddings.
170+
171+
In PD disaggregation the decode engine only produces embeddings for the
172+
tokens it actually computed (1 remaining-prompt + N generated). The
173+
prefill engine has embeddings for the full prompt. We concatenate them
174+
so the talker sees the complete sequence::
175+
176+
merged = prefill[0 : prompt_len] + decode[1:]
177+
^ ^
178+
prompt positions generated positions
179+
(skip overlap at last prompt pos)
180+
"""
181+
try:
182+
p_emb = prefill_mm["0"].detach().to(device=device, dtype=torch.float)
183+
p_hid = prefill_mm["24"].detach().to(device=device, dtype=torch.float)
184+
except (KeyError, AttributeError, TypeError):
185+
return decode_emb, decode_hid
186+
187+
if p_emb.shape[0] == 0 or decode_emb.shape[0] == 0:
188+
return decode_emb, decode_hid
189+
190+
# decode[0] is the recomputed last-prompt-token (overlap with prefill[-1]).
191+
merged_emb = torch.cat([p_emb, decode_emb[1:]], dim=0)
192+
merged_hid = torch.cat([p_hid, decode_hid[1:]], dim=0)
193+
194+
logger.info(
195+
"[PD] Merged prefill(%d) + decode(%d) → %d embeddings",
196+
p_emb.shape[0],
197+
decode_emb.shape[0],
198+
merged_emb.shape[0],
199+
)
200+
return merged_emb, merged_hid
201+
202+
144203
def thinker2talker(
145204
stage_list: list[Any],
146205
engine_input_source: list[int],
@@ -155,6 +214,12 @@ def thinker2talker(
155214
2. Split hidden states into: prompt embeddings + generated embeddings
156215
3. Package for talker with additional information
157216
217+
In PD disaggregation the decode engine's multimodal_output only covers
218+
the tokens it computed (not the full prompt). When a preceding prefill
219+
stage is detected we merge the prefill's prompt embeddings with the
220+
decode's generated embeddings so the talker receives the complete
221+
sequence.
222+
158223
Args:
159224
stage_list: List of stage objects
160225
engine_input_source: Source stage IDs (typically [0] for thinker)
@@ -169,21 +234,55 @@ def thinker2talker(
169234

170235
device = torch.device(current_platform.device_type)
171236

237+
# PD disaggregation: look for a preceding prefill stage whose
238+
# embeddings we need to merge with the decode output.
239+
source_stage_id = engine_input_source[0]
240+
prefill_stage = _get_prefill_stage(stage_list, source_stage_id)
241+
172242
# Process each thinker output
173-
for thinker_output in thinker_outputs:
243+
for i, thinker_output in enumerate(thinker_outputs):
174244
output = thinker_output.outputs[0]
175245

246+
decode_emb = output.multimodal_output["0"].detach().to(device=device, dtype=torch.float)
247+
decode_hid = output.multimodal_output["24"].detach().to(device=device, dtype=torch.float)
248+
249+
# Merge prefill prompt embeddings when running in PD mode.
250+
if prefill_stage is not None:
251+
try:
252+
prefill_eos = prefill_stage.engine_outputs
253+
prefill_eo = prefill_eos[min(i, len(prefill_eos) - 1)]
254+
prefill_mm = prefill_eo.outputs[0].multimodal_output
255+
decode_emb, decode_hid = _merge_pd_embeddings(
256+
decode_emb, decode_hid, prefill_mm, device,
257+
)
258+
except Exception as exc:
259+
logger.warning("[PD] Could not merge prefill embeddings: %s", exc)
260+
261+
# Helper: get TTS embed from decode, fall back to prefill if missing.
262+
def _tts(key: str) -> torch.Tensor:
263+
val = output.multimodal_output.get(key)
264+
if val is None and prefill_stage is not None:
265+
try:
266+
val = (
267+
prefill_stage.engine_outputs[0]
268+
.outputs[0]
269+
.multimodal_output.get(key)
270+
)
271+
except Exception:
272+
pass
273+
return val.detach().to(device=device, dtype=torch.float) if val is not None else None
274+
176275
info = {
177-
"thinker_embeddings": output.multimodal_output["0"].detach().to(device=device, dtype=torch.float),
178-
"thinker_hidden_states": output.multimodal_output["24"].detach().to(device=device, dtype=torch.float),
276+
"thinker_embeddings": decode_emb,
277+
"thinker_hidden_states": decode_hid,
179278
"thinker_sequences": (
180279
thinker_output.prompt_token_ids + output.token_ids
181280
), # the thinker_sequences is the whole ids
182281
"thinker_input_ids": thinker_output.prompt_token_ids,
183282
# Provide thinker-side TTS token embeddings for talker projection
184-
"tts_bos_embed": output.multimodal_output["tts_bos_embed"].detach().to(device=device, dtype=torch.float),
185-
"tts_eos_embed": output.multimodal_output["tts_eos_embed"].detach().to(device=device, dtype=torch.float),
186-
"tts_pad_embed": output.multimodal_output["tts_pad_embed"].detach().to(device=device, dtype=torch.float),
283+
"tts_bos_embed": _tts("tts_bos_embed"),
284+
"tts_eos_embed": _tts("tts_eos_embed"),
285+
"tts_pad_embed": _tts("tts_pad_embed"),
187286
}
188287

189288
prompt_len = _compute_talker_prompt_ids_length(info, device=device)

0 commit comments

Comments
 (0)