|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | + |
| 4 | +""" |
| 5 | +End-to-end test for Bagel img2img generation. |
| 6 | +
|
| 7 | +This test validates that the Bagel model generates images from an input image |
| 8 | +and text prompt that match expected reference pixel values within a ±5 tolerance. |
| 9 | +
|
| 10 | +Equivalent to running: |
| 11 | + python3 examples/offline_inference/bagel/end2end.py \ |
| 12 | + --prompts "Change the grass color to red" \ |
| 13 | + --modality img2img --step 15 \ |
| 14 | + --image-path 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg |
| 15 | +""" |
| 16 | + |
| 17 | +import socket |
| 18 | +from pathlib import Path |
| 19 | +from typing import Any |
| 20 | + |
| 21 | +import pytest |
| 22 | +from PIL import Image |
| 23 | +from vllm.assets.image import ImageAsset |
| 24 | + |
| 25 | +from tests.utils import hardware_test |
| 26 | +from vllm_omni.entrypoints.omni import Omni |
| 27 | + |
# Reference pixel data extracted from the known-good output image.
# Generated with seed=52, num_inference_steps=15,
# prompt='Change the grass color to red',
# input image: 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
# Each entry pairs an (x, y) coordinate with the expected (R, G, B) value
# at that position; sample points cover sky, horizon, and recolored grass.
REFERENCE_PIXELS = [
    {"position": (100, 100), "rgb": (157, 172, 217)},
    {"position": (400, 50), "rgb": (105, 144, 218)},
    {"position": (700, 100), "rgb": (118, 159, 233)},
    {"position": (150, 400), "rgb": (195, 34, 60)},
    {"position": (512, 336), "rgb": (222, 214, 193)},
    {"position": (700, 400), "rgb": (197, 15, 43)},
    {"position": (100, 600), "rgb": (105, 13, 18)},
    {"position": (400, 600), "rgb": (169, 33, 44)},
    {"position": (700, 600), "rgb": (101, 86, 93)},
    {"position": (256, 256), "rgb": (181, 202, 222)},
]

# Maximum allowed absolute difference per color channel when comparing a
# generated pixel against its reference value.
PIXEL_TOLERANCE = 5

# Prompt text wrapped in the model's chat/control tokens — presumably the
# Bagel tokenizer's expected img2img template; verify against the example
# script referenced in the module docstring.
DEFAULT_PROMPT = "<|fim_middle|><|im_start|>Change the grass color to red<|im_end|>"

# Expected (width, height) of the generated output image.
EXPECTED_OUTPUT_SIZE = (1024, 672)
| 50 | + |
| 51 | + |
def _load_input_image() -> Image.Image:
    """Load the boardwalk test asset and return it as an RGB PIL image."""
    asset = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk")
    return asset.pil_image.convert("RGB")
| 55 | + |
| 56 | + |
| 57 | +def _find_free_port() -> int: |
| 58 | + """Find and return a free ephemeral port by binding to port 0.""" |
| 59 | + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: |
| 60 | + s.bind(("127.0.0.1", 0)) |
| 61 | + s.listen(1) |
| 62 | + port = s.getsockname()[1] |
| 63 | + return port |
| 64 | + |
| 65 | + |
def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list:
    """Configure sampling parameters for Bagel img2img generation.

    Args:
        omni: The Omni instance to get default params from.
        max_tokens: Maximum tokens for the first stage.
        num_inference_steps: Number of inference steps for the diffusion stage.

    Returns:
        Configured sampling params list.
    """
    params = omni.default_sampling_params_list
    first_stage = params[0]
    first_stage.max_tokens = max_tokens  # type: ignore
    # The diffusion stage is optional; configure it only when present.
    if len(params) >= 2:
        diffusion_stage = params[1]
        diffusion_stage.num_inference_steps = num_inference_steps  # type: ignore
        # Classifier-free guidance scales for text and image conditioning.
        diffusion_stage.extra_args = {  # type: ignore
            "cfg_text_scale": 4.0,
            "cfg_img_scale": 1.5,
        }
    return params
| 86 | + |
| 87 | + |
def _extract_generated_image(omni_outputs: list) -> Image.Image | None:
    """Extract the generated image from Omni outputs.

    Args:
        omni_outputs: List of outputs from omni.generate().

    Returns:
        The first generated PIL Image, or None if no image found.
    """
    for output in omni_outputs:
        # Prefer images attached directly to the request output.
        direct_images = getattr(output, "images", None)
        if direct_images:
            return direct_images[0]
        # Otherwise look through the nested per-stage outputs, if any.
        stage_outputs = getattr(output, "request_output", None)
        if stage_outputs:
            for stage in stage_outputs:
                stage_images = getattr(stage, "images", None)
                if stage_images:
                    return stage_images[0]
    return None
| 105 | + |
| 106 | + |
def _validate_pixels(
    image: Image.Image,
    reference_pixels: list[dict[str, Any]] = REFERENCE_PIXELS,
    tolerance: int = PIXEL_TOLERANCE,
) -> None:
    """Validate that image pixels match expected reference values.

    Args:
        image: The PIL Image to validate.
        reference_pixels: List of dicts with 'position' (x, y) and 'rgb' (R, G, B).
        tolerance: Maximum allowed difference per color channel.

    Raises:
        AssertionError: If any pixel differs beyond tolerance.
    """
    for entry in reference_pixels:
        x, y = entry["position"]
        expected_rgb = entry["rgb"]
        # Drop any alpha channel before comparing.
        actual_rgb = image.getpixel((x, y))[:3]
        deltas = [abs(got - want) for got, want in zip(actual_rgb, expected_rgb)]
        assert all(delta <= tolerance for delta in deltas), (
            f"Pixel mismatch at ({x}, {y}): expected {expected_rgb}, got {actual_rgb}"
        )
| 129 | + |
| 130 | + |
def _generate_bagel_img2img(
    omni: Omni,
    input_image: Image.Image,
    prompt: str = DEFAULT_PROMPT,
) -> Image.Image:
    """Generate an image using Bagel model with img2img pipeline.

    Args:
        omni: The Omni instance to use for generation.
        input_image: The input PIL Image for img2img.
        prompt: The text prompt for image editing.

    Returns:
        The generated PIL Image.

    Raises:
        AssertionError: If no image is generated or size is incorrect.
    """
    sampling_params = _configure_sampling_params(omni)

    request = {
        "prompt": prompt,
        "multi_modal_data": {"img2img": input_image},
        "modalities": ["img2img"],
    }
    outputs = list(
        omni.generate(
            prompts=[request],
            sampling_params_list=sampling_params,
        )
    )

    image = _extract_generated_image(outputs)
    assert image is not None, "No images generated"
    assert image.size == EXPECTED_OUTPUT_SIZE, f"Expected {EXPECTED_OUTPUT_SIZE}, got {image.size}"

    return image
| 169 | + |
| 170 | + |
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_img2img_shared_memory_connector():
    """Test Bagel img2img with shared memory connector."""
    source_image = _load_input_image()
    stage_config = Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml"
    omni = Omni(
        model="ByteDance-Seed/BAGEL-7B-MoT",
        stage_configs_path=str(stage_config),
        stage_init_timeout=300,
    )

    # Always release engine resources, even if generation or validation fails.
    try:
        result_image = _generate_bagel_img2img(omni, source_image)
        _validate_pixels(result_image)
    finally:
        omni.close()
0 commit comments