EmbeddedLLM
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py
Lines changed: 69 additions & 27 deletions
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/format.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/format.py
Lines changed: 13 additions & 9 deletions
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py
Lines changed: 5 additions & 1 deletion
diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md
Lines changed: 116 additions & 2 deletions
@@ -6,6 +6,7 @@
 Original source at https://github.com/dougbtv/comfyui-vllm-omni, distributed under the MIT License.
 """
 
+import asyncio
 import json
 from typing import Any
 
@@ -18,8 +19,8 @@
     audio_to_base64,
     base64_to_audio,
     base64_to_image_tensor,
-    base64_to_video,
     bytes_to_audio,
+    bytes_to_video,
     image_tensor_to_base64,
     image_tensor_to_png_bytes,
     video_to_base64,
@@ -31,10 +32,43 @@
 logger = get_logger(__name__)
 
 
+async def url_json(session: aiohttp.ClientSession, url: str, verb: str = "get", **kwargs) -> dict[str, Any]:
+    try:
+        async with getattr(session, verb)(url, **kwargs) as response:
+            if not response.ok:
+                error_text = await response.text()
+                raise (ValueError if response.status < 500 else RuntimeError)(
+                    f"vLLM-Omni API returned status {response.status}: {error_text}"
+                )
+            try:
+                return await response.json()
+            except aiohttp.ContentTypeError as e:
+                raise RuntimeError(f"Invalid JSON response from vLLM-Omni: {e}")
+    except aiohttp.ClientError as e:
+        raise RuntimeError(f"Network error connecting to vLLM-Omni at {url}: {e}")
+
+
+async def url_bytes(session: aiohttp.ClientSession, url: str, verb: str = "get", **kwargs) -> bytes:
+    try:
+        async with getattr(session, verb)(url, **kwargs) as response:
+            if not response.ok:
+                error_text = await response.text()
+                raise (ValueError if response.status < 500 else RuntimeError)(
+                    f"vLLM-Omni API returned status {response.status}: {error_text}"
+                )
+            return await response.read()
+    except aiohttp.ClientError as e:
+        raise RuntimeError(f"Network error connecting to vLLM-Omni at {url}: {e}")
+
+
 class VLLMOmniClient:
-    def __init__(self, base_url: str, timeout: float | None = None):
+    def __init__(
+        self, base_url: str, timeout: float | None = None, poll_interval: float = 5.0, max_poll_duration: float = 60 * 5
+    ):
         self.base_url = base_url
         self.timeout = aiohttp.ClientTimeout(total=timeout)
+        self.poll_interval = poll_interval
+        self.max_poll_duration = max_poll_duration
 
     async def generate_image(
         self,
@@ -262,33 +296,41 @@ async def generate_video(
                 content_type="image/png",
             )
 
-        url = self.base_url + "/videos"
         async with aiohttp.ClientSession(timeout=self.timeout) as session:
+            # Start the video generation job
+            url = f"{self.base_url}/videos"
+            data = await url_json(session, url, "post", data=form)
+            if (job_id := data.get("id", None)) is None:
+                raise RuntimeError("API response missing job 'id' field - expected OpenAI compliant format")
+            if (job_status := data.get("status", None)) is None:
+                raise RuntimeError("API response missing job 'status' field - expected OpenAI compliant format")
+
+            # Poll for video generation job completion
+            deadline = asyncio.get_running_loop().time() + self.max_poll_duration
+            url = f"{self.base_url}/videos/{job_id}"
+            while job_status not in {"completed", "failed"}:
+                await asyncio.sleep(self.poll_interval)
+
+                data = await url_json(session, url)
+                if (job_status := data.get("status", None)) is None:
+                    raise RuntimeError("API response missing job 'status' field - expected OpenAI compliant format")
+                if asyncio.get_running_loop().time() >= deadline:
+                    raise RuntimeError(f"Timed out waiting for video job {job_id} to complete")
+
+            if job_status == "failed":
+                raise RuntimeError(f"Video job failed: {data}")
+
+            # Retrieve completed content
+            video_bytes = await url_bytes(session, f"{url}/content")
+
+            # Decode video and make a best effort at cleaning up server resources
             try:
-                async with session.post(url, data=form) as response:
-                    if not response.ok:
-                        error_text = await response.text()
-                        raise (ValueError if response.status < 500 else RuntimeError)(
-                            f"vLLM-Omni API returned status {response.status}: {error_text}"
-                        )
-
-                    try:
-                        data = await response.json()
-                    except aiohttp.ContentTypeError as e:
-                        raise RuntimeError(f"Invalid JSON response from vLLM-Omni: {e}")
-            except aiohttp.ClientError as e:
-                raise RuntimeError(f"Network error connecting to vLLM-Omni at {url}: {e}")
-
-        if "data" not in data:
-            raise RuntimeError("API response missing 'data' field - expected OpenAI DALL-E format")
-        if not data["data"]:
-            raise RuntimeError("API returned empty data array")
-        try:
-            base64_str = data["data"][0]["b64_json"]
-        except (KeyError, IndexError):
-            raise RuntimeError("API response missing 'b64_json' field in first data item")
-
-        return base64_to_video(base64_str)
+                return bytes_to_video(video_bytes)
+            finally:
+                try:
+                    await url_json(session, url, "delete")
+                except Exception as exc:
+                    logger.warning("Failed to clean up video job %s: %s", job_id, exc)
 
     async def generate_understanding_chat_completion(
         self,
 
@@ -143,15 +143,7 @@ def video_to_base64(video: VideoInput, filename: str = "video.mp4") -> str:
     return f"data:{mime_type};base64,{base64_str}"
 
 
-def base64_to_video(base64_str: str) -> VideoInput:
-    if base64_str.startswith("data:video"):
-        _, base64_str = base64_str.split(",", 1)
-
-    try:
-        video_bytes = base64.b64decode(base64_str)
-    except Exception as e:
-        raise ValueError(f"Invalid base64 string: {e}")
-
+def bytes_to_video(video_bytes: bytes) -> VideoInput:
     video_buffer = BytesIO(video_bytes)
 
     try:
@@ -208,6 +200,18 @@ def base64_to_video(base64_str: str) -> VideoInput:
     return InputImpl.VideoFromComponents(components)
 
 
+def base64_to_video(base64_str: str) -> VideoInput:
+    if base64_str.startswith("data:video"):
+        _, base64_str = base64_str.split(",", 1)
+
+    try:
+        video_bytes = base64.b64decode(base64_str)
+    except Exception as e:
+        raise ValueError(f"Invalid base64 string: {e}")
+
+    return bytes_to_video(video_bytes)
+
+
 def audio_to_bytes(audio: AudioInput, filename: str = "audio.mp3", quality: str = "128k") -> BytesIO:
     waveform = audio["waveform"][0]  # Shape: (C, T)
     sample_rate = audio["sample_rate"]
 
@@ -70,13 +70,17 @@ def add_sampling_parameters_to_stage(
 
     stages = pipeline_spec["stages"]
     if isinstance(sampling_param_list, dict):
+        sampling_param_list = sampling_param_list.__class__(sampling_param_list)
         sampling_param_list.update(params_to_add)
     elif sampling_param_list is None:
         sampling_param_list = params_to_add.copy()
     else:
         for i, stage in enumerate(stages):
             if stage == stage_type:
-                sampling_param_list[i].update(params_to_add)
+                stage_param = sampling_param_list[i]
+                stage_param = stage_param.__class__(stage_param)
+                stage_param.update(params_to_add)
+                sampling_param_list[i] = stage_param
 
     return sampling_param_list
 
 
@@ -29,6 +29,35 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
+## Async Job Behavior
+
+`POST /v1/videos` is asynchronous. It creates a video job and immediately
+returns metadata such as the job ID and the initial `queued` status. To get the
+final artifact, poll the job until its status is `completed` (or `failed`), then
+download the finished file from the content endpoint.
+
+The main endpoints are:
+
+- `POST /v1/videos`: create a video generation job
+- `GET /v1/videos/{video_id}`: retrieve the current job status and metadata
+- `GET /v1/videos`: list stored video jobs
+- `GET /v1/videos/{video_id}/content`: download the generated video file
+- `DELETE /v1/videos/{video_id}`: delete the job and any stored output
+
+## Storage
+
+Generated video files are stored on local disk by the async video API.
+Local file storage behavior can be controlled via the following environment variables:
+
+- `VLLM_OMNI_STORAGE_PATH`: directory used for generated files (default: `/tmp/storage`)
+- `VLLM_OMNI_STORAGE_MAX_CONCURRENCY`: max concurrent save/delete operations (default: `4`)
+
+Example:
+
+```bash
+export VLLM_OMNI_STORAGE_PATH=/var/tmp/vllm-omni-videos
+export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8
+```
+
 ## API Calls
 
 ### Method 1: Using curl
@@ -38,7 +67,7 @@ The script allows overriding:
 bash run_curl_image_to_video.sh
 
 # Or execute directly (OpenAI-style multipart)
-curl -X POST http://localhost:8091/v1/videos \
+create_response=$(curl -s http://localhost:8091/v1/videos \
   -H "Accept: application/json" \
   -F "prompt=A bear playing with yarn, smooth motion" \
   -F "negative_prompt=low quality, blurry, static" \
@@ -52,7 +81,23 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F "seed=42" | jq -r '.data[0].b64_json' | base64 -d > wan22_i2v_output.mp4
+  -F "seed=42")
+
+video_id=$(echo "$create_response" | jq -r '.id')
+while true; do
+  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+
+curl -s "http://localhost:8091/v1/videos/${video_id}" | jq .
+curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o wan22_i2v_output.mp4
 ```
 
 ## Request Format
@@ -66,6 +111,18 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "input_reference=@/path/to/qwen-bear.png"
 ```
 
+### Alternative JSON-Safe Reference Input
+
+Use `image_reference` when you want to pass a URL or JSON-safe image reference
+instead of uploading a file. Do not send `input_reference` and
+`image_reference` together.
+
+```bash
+curl -X POST http://localhost:8091/v1/videos \
+  -F "prompt=A bear playing with yarn, smooth motion" \
+  -F 'image_reference={"image_url":"https://example.com/qwen-bear.png"}'
+```
+
 ### Generation with Parameters
 
 ```bash
@@ -85,6 +142,63 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "seed=42"
 ```
 
+## Create Response Format
+
+`POST /v1/videos` returns a job record.
+
+```json
+{
+  "id": "video_gen_123",
+  "object": "video",
+  "status": "queued",
+  "model": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+  "prompt": "A bear playing with yarn, smooth motion",
+  "created_at": 1234567890
+}
+```
+
+## Retrieve, List, Download, and Delete
+
+### Retrieve a job
+
+```bash
+curl -s http://localhost:8091/v1/videos/${video_id} | jq .
+```
+
+### List jobs
+
+```bash
+curl -s http://localhost:8091/v1/videos | jq .
+```
+
+### Download the completed video
+
+```bash
+curl -L http://localhost:8091/v1/videos/${video_id}/content -o wan22_i2v_output.mp4
+```
+
+### Delete a job and its stored file
+
+```bash
+curl -X DELETE http://localhost:8091/v1/videos/${video_id} | jq .
+```
+
+## Poll Until Complete
+
+```bash
+while true; do
+  status=$(curl -s http://localhost:8091/v1/videos/${video_id} | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+```
+
 ## Example materials
 
 ??? abstract "run_curl_image_to_video.sh"