[BugFix]: Fix bagel online inference bug (vllm-project#1804)

princepride · web-flow · commit e0b71056bad6 · 2026-03-13T08:58:44.000Z
Signed-off-by: princepride <wangzhipeng628@gmail.com>
diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py
@@ -17,12 +17,14 @@
 from pathlib import Path
 
 import numpy as np
-import pytest
+
+# import pytest
 import torch
 from PIL import Image
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 
-from tests.utils import DeviceMemoryMonitor, hardware_test
+# from tests.utils import DeviceMemoryMonitor, hardware_test
+from tests.utils import DeviceMemoryMonitor
 from vllm_omni import Omni
 from vllm_omni.diffusion.data import DiffusionParallelConfig
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
@@ -20,6 +20,7 @@
 from vllm_omni.distributed.omni_connectors.adapter import compute_talker_prompt_ids_length, try_send_via_connector
 from vllm_omni.distributed.ray_utils.utils import try_close_ray
 from vllm_omni.engine.input_processor import OmniInputProcessor
+from vllm_omni.entrypoints.cfg_companion_tracker import CfgCompanionTracker
 from vllm_omni.entrypoints.client_request_state import ClientRequestState
 from vllm_omni.entrypoints.omni import OmniBase
 from vllm_omni.entrypoints.omni_stage import OmniStage
@@ -28,7 +29,7 @@
 from vllm_omni.entrypoints.utils import (
     get_final_stage_id_for_e2e,
 )
-from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams
 
 # Internal imports (our code)
 from vllm_omni.lora.request import LoRARequest
@@ -125,6 +126,9 @@ def __init__(self, model: str, **kwargs: dict[str, Any]) -> None:
         # Used to avoid race condition between output_handler and collective_rpc
         self._rpc_results: dict[int, dict[str, dict[str, Any]]] = {}
 
+        # CFG companion → parent request ID mapping for output routing
+        self._companion_to_parent: dict[str, str] = {}
+
         super().__init__(model, **kwargs)
 
         # Register weak reference cleanup (called on garbage collection)
@@ -389,13 +393,38 @@ async def generate(
             req_state = ClientRequestState(request_id)
             req_state.metrics = metrics
             self.request_states[request_id] = req_state
+
+            # Ensure modalities is in the prompt dict for CFG expansion
+            # (offline path includes it; online serving passes it separately)
+            if isinstance(prompt, dict) and output_modalities and "modalities" not in prompt:
+                prompt["modalities"] = output_modalities
+
+            # CFG companion tracking (prompt expansion + lifecycle management)
+            cfg = CfgCompanionTracker(
+                prompt_expand_func=getattr(self.stage_list[0], "prompt_expand_func", None),
+                stage0_sampling_params=sampling_params_list[0],
+            )
+            expanded_companions = cfg.expand_prompts({request_id: prompt})
+
             sp0: SamplingParams = sampling_params_list[0]  # type: ignore[index]
             task = {
                 "request_id": request_id,
                 "engine_inputs": prompt,
                 "sampling_params": sp0,
             }
             self.stage_list[0].submit(task)
+
+            # Submit CFG companion requests to stage-0
+            if cfg.is_active:
+                for companion_id, companion_prompt in expanded_companions:
+                    self._companion_to_parent[companion_id] = request_id
+                    companion_task = {
+                        "request_id": companion_id,
+                        "engine_inputs": companion_prompt,
+                        "sampling_params": cfg.stage0_sampling_params,
+                    }
+                    self.stage_list[0].submit(companion_task)
+
             metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
             _req_start_ts[request_id] = time.time()
             logger.info(
@@ -421,6 +450,7 @@ async def generate(
                     final_stage_id_for_e2e,
                     sampling_params_list,
                     prompt,
+                    cfg=cfg,
                 ):
                     yield output
 
@@ -440,6 +470,9 @@ async def generate(
                 logger.exception(f"[{self._name}] Request {request_id} Failed to finalized/build/log summary: {e}")
             finally:
                 self.request_states.pop(request_id, None)
+                if cfg.is_active:
+                    for cid in cfg.get_companion_request_ids(request_id).values():
+                        self._companion_to_parent.pop(cid, None)
         except (asyncio.CancelledError, GeneratorExit):
             await self.abort(request_id)
             logger.info("[AsyncOrchestrator] Request %s aborted.", request_id)
@@ -603,12 +636,29 @@ async def _process_sequential_results(
         final_stage_id_for_e2e: int,
         sampling_params_list: list[SamplingParams],
         prompt: Any,
+        cfg: CfgCompanionTracker | None = None,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
         for stage_id, stage in enumerate(self.stage_list[: final_stage_id_for_e2e + 1]):
+            cfg_stage0 = stage_id == 0 and cfg is not None and cfg.is_active
             finished = False
-            while not finished:
+
+            while True:
+                if finished and (
+                    not cfg_stage0 or cfg.all_companions_done(request_id) or cfg.is_parent_failed(request_id)
+                ):
+                    break
+
                 result = await req_state.queue.get()
-                assert stage_id == req_state.stage_id
+
+                if cfg is not None and cfg.is_companion(result.get("request_id", "")):
+                    if cfg_stage0:
+                        rid = result.get("request_id")
+                        if "error" in result:
+                            cfg.on_companion_error(rid)
+                        else:
+                            cfg.on_companion_completed(rid)
+                    continue
+
                 engine_outputs, finished, output_to_yield = self._process_single_result(
                     result,
                     stage,
@@ -629,6 +679,16 @@ async def _process_sequential_results(
                     next_inputs = next_stage.process_engine_inputs(self.stage_list, prompt)
                 sp_next: SamplingParams = sampling_params_list[next_stage_id]
 
+                if cfg is not None and cfg.is_active and not cfg.is_parent_failed(request_id):
+                    if isinstance(sp_next, OmniDiffusionSamplingParams):
+                        sp_next = copy.deepcopy(sp_next)
+                        sp_next.cfg_kv_request_ids = cfg.get_companion_request_ids(request_id)
+                        logger.info(
+                            "Attaching cfg_kv_request_ids=%s to request %s",
+                            sp_next.cfg_kv_request_ids,
+                            request_id,
+                        )
+
                 # Check if we have a connector for this edge
                 connector_key = (str(stage_id), str(next_stage_id))
                 connector = self.connectors.get(connector_key)
@@ -747,6 +807,7 @@ def _run_output_handler(self) -> None:
 
         stage_list = self.stage_list
         request_states = self.request_states
+        companion_to_parent = self._companion_to_parent
 
         async def output_handler():
             try:
@@ -773,6 +834,10 @@ async def output_handler():
                             continue
                         req_id = result.get("request_id")
                         req_state = request_states.get(req_id)
+                        if req_state is None:
+                            parent_id = companion_to_parent.get(req_id)
+                            if parent_id is not None:
+                                req_state = request_states.get(parent_id)
                         if req_state is None:
                             logger.debug(
                                 f"[{self._name}] Request may have been aborted; \
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -311,17 +311,21 @@ async def create_chat_completion(
                 negative_prompt = extra_body.get("negative_prompt")
 
                 engine_prompt_image: dict[str, Any] | None = None
+                is_img2img = False
                 if reference_images:
                     # Best-effort decode first reference image for i2i.
                     try:
                         img_bytes = base64.b64decode(reference_images[0])
                         img = Image.open(BytesIO(img_bytes))
-                        engine_prompt_image = {"image": img}
+                        engine_prompt_image = {"img2img": img}
+                        is_img2img = True
                     except Exception:
                         engine_prompt_image = None
 
                 # Override the prompts produced by chat-template preprocessing.
                 tprompt: OmniTextPrompt = {"prompt": extracted_prompt}
+                if is_img2img:
+                    tprompt["modalities"] = ["img2img"]
                 if negative_prompt is not None:
                     tprompt["negative_prompt"] = negative_prompt
                 # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs
@@ -490,10 +494,11 @@ async def _preprocess_chat(
             )
 
         # Preserve a clean text prompt for downstream stages (e.g., GLM-Image diffusion).
-        # For /v1/chat/completions, `request_prompt` is often the rendered chat template.
-        # Diffusion models generally want the raw user caption instead.
-        output_modalities = getattr(self.engine_client, "output_modalities", None)
-        if output_modalities and ("image" in output_modalities):
+        # For image generation, we want the raw user caption instead of a rendered template.
+        # But for multimodal comprehension (img2text), we MUST keep the rendered prompt
+        # containing image tokens.
+        req_modalities = getattr(request, "modalities", [])
+        if req_modalities and ("image" in req_modalities):
             messages_as_dicts: list[dict[str, Any]] = []
             for msg in messages:
                 if hasattr(msg, "model_dump"):
diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py
@@ -202,6 +202,15 @@ def _get_subparsers(self):
 
 
 class OmniBagelMultiModalProcessor(BaseMultiModalProcessor[OmniBagelProcessingInfo]):
+    IMG2IMG_PLACEHOLDER = "<|fim_middle|>"
+
+    def _cached_apply_hf_processor(self, inputs, timing_ctx):
+        # img2img: the prompt text is modified based on mm data presence (see the placeholder
+        # handling in _call_hf_processor), so text and mm data cannot be tokenized separately — bypass cache.
+        if inputs.mm_data_items.get_all_counts().get("img2img", 0) > 0:
+            return self._apply_hf_processor(inputs, timing_ctx)  # at least one img2img item: skip the cache
+        return super()._cached_apply_hf_processor(inputs, timing_ctx)  # no img2img items: inherited behavior
+
     def _get_mm_fields_config(self, hf_inputs, hf_processor_mm_kwargs):
         return {
             "pixel_values": MultiModalFieldConfig.batched("image"),
@@ -218,6 +227,9 @@ def _call_hf_processor(
         has_image = "images" in mm_data
         has_img2img = "pixel_values_img2img" in mm_data
 
+        if has_img2img and self.IMG2IMG_PLACEHOLDER not in prompt:
+            prompt = f"{self.IMG2IMG_PLACEHOLDER}{prompt}"
+
         if has_image and has_img2img:
             outputs = BatchFeature()
 
diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py
@@ -78,7 +78,11 @@
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
-from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -115,6 +119,15 @@ class GlmImagePixelInputs(TensorSchema):
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
+class GlmImageDataParser(MultiModalDataParser):
+    """GLM-Image treats ``img2img`` input identically to ``image``."""
+
+    def _get_subparsers(self):
+        parsers = super()._get_subparsers()  # start from the parent class's modality -> parser mapping
+        parsers["img2img"] = self._parse_image_data  # "img2img" items go through the image parsing routine
+        return parsers
+
+
 class GlmImageProcessingInfo(BaseProcessingInfo):
     """
     Processing information for GLM-Image model.
@@ -162,14 +175,19 @@ def get_hf_processor(self, **kwargs: object):
             **kwargs,
         )
 
+    def get_data_parser(self) -> GlmImageDataParser:
+        return GlmImageDataParser(  # model-specific parser that also registers the "img2img" key
+            expected_hidden_size=self._get_expected_hidden_size(),
+        )
+
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         # GLM-Image is an image GENERATION model that supports:
         # - Text-to-image (t2i): no multimodal input needed
         # - Image-to-image (i2i): source images provided as input
         #
         # For i2i mode, we support up to 1 image as condition.
-        # The model architecture supports multiple images but typical usage is 1.
-        return {"image": 1}
+        # "img2img" is an alias used by the serving layer; parsed as "image".
+        return {"image": 1, "img2img": 1}
 
     def get_num_image_tokens(
         self,