Skip to content

Commit 423bdca

Browse files
princepridehsliuustc0106
authored and committed
[CI]: Bagel E2E Smoked Test (#1074)
Signed-off-by: princepride <wangzhipeng628@gmail.com> Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 220cd59 commit 423bdca

File tree

7 files changed

+518
-6
lines changed

7 files changed

+518
-6
lines changed

.buildkite/pipeline.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,3 +305,38 @@ steps:
305305
hostPath:
306306
path: /mnt/hf-cache
307307
type: DirectoryOrCreate
308+
309+
- label: "Bagel Text2Img Model Test with H100"
310+
timeout_in_minutes: 30
311+
depends_on: image-build
312+
commands:
313+
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
314+
- pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
315+
agents:
316+
queue: "mithril-h100-pool"
317+
plugins:
318+
- kubernetes:
319+
podSpec:
320+
containers:
321+
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
322+
resources:
323+
limits:
324+
nvidia.com/gpu: 1
325+
volumeMounts:
326+
- name: devshm
327+
mountPath: /dev/shm
328+
- name: hf-cache
329+
mountPath: /root/.cache/huggingface
330+
env:
331+
- name: HF_HOME
332+
value: /root/.cache/huggingface
333+
nodeSelector:
334+
node.kubernetes.io/instance-type: gpu-h100-sxm
335+
volumes:
336+
- name: devshm
337+
emptyDir:
338+
medium: Memory
339+
- name: hf-cache
340+
hostPath:
341+
path: /mnt/hf-cache
342+
type: DirectoryOrCreate

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ dev = [
5959
"psutil>=7.2.0",
6060
"soundfile>=0.13.1",
6161
"imageio[ffmpeg]>=0.6.0",
62-
"opencv-python>=4.12.0.88"
62+
"opencv-python>=4.12.0.88",
63+
"mooncake-transfer-engine==0.3.8.post1"
6364
]
6465

6566
docs = [
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# stage config for running BAGEL with Mooncake connector for CI e2e tests.
2+
# This config is optimized for single GPU tests with Mooncake inter-stage communication.
3+
4+
stage_args:
5+
- stage_id: 0
6+
stage_type: llm
7+
runtime:
8+
devices: "0"
9+
max_batch_size: 1
10+
engine_args:
11+
model_stage: thinker
12+
model_arch: BagelForConditionalGeneration
13+
worker_type: ar
14+
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
15+
gpu_memory_utilization: 0.60
16+
enforce_eager: true
17+
trust_remote_code: true
18+
engine_output_type: text
19+
distributed_executor_backend: mp
20+
enable_prefix_caching: false
21+
max_num_batched_tokens: 32768
22+
tensor_parallel_size: 1
23+
omni_kv_config:
24+
need_send_cache: true
25+
kv_transfer_criteria:
26+
type: prefill_finished
27+
final_output: true
28+
final_output_type: text
29+
is_comprehension: true
30+
default_sampling_params:
31+
temperature: 0.4
32+
top_p: 0.9
33+
top_k: 1
34+
max_tokens: 2048
35+
seed: 52
36+
detokenize: true
37+
repetition_penalty: 1.05
38+
output_connectors:
39+
to_stage_1: mooncake_connector
40+
- stage_id: 1
41+
stage_type: diffusion
42+
runtime:
43+
devices: "0"
44+
max_batch_size: 1
45+
engine_args:
46+
model_stage: dit
47+
gpu_memory_utilization: 0.4
48+
enforce_eager: true
49+
trust_remote_code: true
50+
engine_output_type: image
51+
distributed_executor_backend: mp
52+
enable_prefix_caching: false
53+
max_num_batched_tokens: 32768
54+
tensor_parallel_size: 1
55+
omni_kv_config:
56+
need_recv_cache: true
57+
engine_input_source: [0]
58+
final_output: true
59+
final_output_type: image
60+
is_comprehension: false
61+
default_sampling_params:
62+
seed: 52
63+
input_connectors:
64+
from_stage_0: mooncake_connector
65+
66+
# Top-level runtime config with Mooncake connector
67+
runtime:
68+
enabled: true
69+
defaults:
70+
window_size: -1
71+
max_inflight: 1
72+
connectors:
73+
mooncake_connector:
74+
name: MooncakeConnector
75+
extra:
76+
host: "${MOONCAKE_HOST}"
77+
metadata_server: "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata"
78+
master: "${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}"
79+
segment: 64000000
80+
localbuf: 64000000
81+
proto: tcp
82+
edges:
83+
- from: 0
84+
to: 1
85+
window_size: -1
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# stage config for running BAGEL with SharedMemory connector for CI e2e tests.
2+
# This config is optimized for single GPU tests with SharedMemory inter-stage communication.
3+
4+
stage_args:
5+
- stage_id: 0
6+
stage_type: llm
7+
runtime:
8+
devices: "0"
9+
max_batch_size: 1
10+
engine_args:
11+
model_stage: thinker
12+
model_arch: BagelForConditionalGeneration
13+
worker_type: ar
14+
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
15+
gpu_memory_utilization: 0.65
16+
enforce_eager: true
17+
trust_remote_code: true
18+
engine_output_type: text
19+
distributed_executor_backend: "mp"
20+
enable_prefix_caching: false
21+
max_num_batched_tokens: 32768
22+
tensor_parallel_size: 1
23+
omni_kv_config:
24+
need_send_cache: true
25+
kv_transfer_criteria:
26+
type: prefill_finished  # or special token generated
27+
final_output: true
28+
final_output_type: text
29+
is_comprehension: true
30+
default_sampling_params:
31+
temperature: 0.4
32+
top_p: 0.9
33+
top_k: 1
34+
max_tokens: 2048
35+
seed: 52
36+
detokenize: true
37+
repetition_penalty: 1.05
38+
39+
- stage_id: 1
40+
stage_type: diffusion
41+
runtime:
42+
devices: "0"
43+
max_batch_size: 1
44+
engine_args:
45+
model_stage: dit
46+
gpu_memory_utilization: 0.4
47+
enforce_eager: true
48+
trust_remote_code: true
49+
engine_output_type: image
50+
distributed_executor_backend: "mp"
51+
enable_prefix_caching: false
52+
max_num_batched_tokens: 32768
53+
tensor_parallel_size: 1
54+
omni_kv_config:
55+
need_recv_cache: true
56+
engine_input_source: [0]
57+
58+
final_output: true
59+
final_output_type: image
60+
is_comprehension: false
61+
default_sampling_params:
62+
seed: 52
63+
64+
# Runtime edges
65+
runtime:
66+
enabled: true
67+
defaults:
68+
window_size: -1
69+
max_inflight: 1
70+
71+
# Distributed connectors configuration (optional)
72+
# More connectors will be supported in the future.
73+
connectors:
74+
shared_memory_connector:
75+
name: SharedMemoryConnector
76+
extra:
77+
shm_threshold_bytes: 65536 # 64KB threshold
78+
79+
80+
edges:
81+
- from: 0
82+
to: 1
83+
window_size: -1

0 commit comments

Comments
 (0)