feat: support seedance 1.5 pro and sleep with max_wait_seconds (#406)

zakahan · web-flow · commit 5bd0ea60e2e7 · 2025-12-26T17:56:12.000+08:00
* chore: support seedance 1.5 pro

* chore: video generate sleep with `max_wait_seconds`

* fix: timeout of image generate

* fix: config yaml

* fix: consts
diff --git a/config.yaml.full b/config.yaml.full
@@ -20,7 +20,7 @@ model:
     api_base: https://ark.cn-beijing.volces.com/api/v3/
     api_key:
   video:
-    name: doubao-seedance-1-0-pro-250528
+    name: doubao-seedance-1-5-pro-251215
     api_base: https://ark.cn-beijing.volces.com/api/v3/
     api_key: 
   image:
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,8 +22,8 @@ dependencies = [
     "opentelemetry-instrumentation-logging>=0.56b0",
     "wrapt==1.17.2", # For patching built-in functions
     "openai<1.100", # For fix https://github.com/BerriAI/litellm/issues/13710
-    "volcengine-python-sdk==4.0.33", # For Volcengine API
-    "volcengine==1.0.193", # For Volcengine sign
+    "volcengine-python-sdk>=5.0.1", # For Volcengine API
+    "volcengine>=1.0.193", # For Volcengine sign
     "agent-pilot-sdk==0.1.2", # Prompt optimization by Volcengine AgentPilot/PromptPilot toolkits
     "fastmcp==2.12.3", # For running MCP
     "trustedmcp==0.0.5", # For running TrustedMCP
diff --git a/veadk/consts.py b/veadk/consts.py
@@ -65,7 +65,7 @@
 DEFAULT_IMAGE_EDIT_MODEL_NAME = "doubao-seededit-3-0-i2i-250628"
 DEFAULT_IMAGE_EDIT_MODEL_API_BASE = "https://ark.cn-beijing.volces.com/api/v3/"
 
-DEFAULT_VIDEO_MODEL_NAME = "doubao-seedance-1-0-pro-250528"
+DEFAULT_VIDEO_MODEL_NAME = "doubao-seedance-1-5-pro-251215"
 DEFAULT_VIDEO_MODEL_API_BASE = "https://ark.cn-beijing.volces.com/api/v3/"
 
 DEFAULT_IMAGE_GENERATE_MODEL_NAME = "doubao-seedream-4-5-251128"
diff --git a/veadk/tools/builtin_tools/image_generate.py b/veadk/tools/builtin_tools/image_generate.py
@@ -83,7 +83,10 @@ def _build_input_parts(item: dict, task_type: str, image_field):
 
 
 def handle_single_task_sync(
-    idx: int, item: dict, tool_context
+    idx: int,
+    item: dict,
+    timeout: int,
+    tool_context,
 ) -> tuple[list[dict], list[str]]:
     logger.debug(f"handle_single_task_sync item {idx}: {item}")
     success_list: list[dict] = []
@@ -139,6 +142,7 @@ def handle_single_task_sync(
                             "MODEL_AGENT_CLIENT_REQ_ID", f"veadk/{VERSION}"
                         ),
                     },
+                    timeout=timeout,
                 )
             else:
                 response = client.images.generate(
@@ -152,6 +156,7 @@ def handle_single_task_sync(
                             "MODEL_AGENT_CLIENT_REQ_ID", f"veadk/{VERSION}"
                         ),
                     },
+                    timeout=timeout,
                 )
 
             if not response.error:
@@ -228,14 +233,16 @@ def handle_single_task_sync(
     return success_list, error_list
 
 
-async def image_generate(tasks: list[dict], tool_context) -> Dict:
-    """Generate images with Seedream 4.0.
+async def image_generate(tasks: list[dict], tool_context, timeout: int = 600) -> Dict:
+    """Generate images with Seedream 4.0 / 4.5.
 
     Commit batch image generation requests via tasks.
 
     Args:
         tasks (list[dict]):
             A list of image-generation tasks. Each task is a dict.
+        timeout (int):
+            Timeout for each image generation request, in seconds. Defaults to 600.
     Per-task schema
     ---------------
     Required:
@@ -336,7 +343,9 @@ async def image_generate(tasks: list[dict], tool_context) -> Dict:
 
         def make_task(idx, item):
             ctx = base_ctx.copy()
-            return lambda: ctx.run(handle_single_task_sync, idx, item, tool_context)
+            return lambda: ctx.run(
+                handle_single_task_sync, idx, item, timeout, tool_context
+            )
 
         loop = asyncio.get_event_loop()
         futures = [
diff --git a/veadk/tools/builtin_tools/video_generate.py b/veadk/tools/builtin_tools/video_generate.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import json
-import time
+import asyncio
 import traceback
 from typing import Dict, cast
 
@@ -41,14 +41,27 @@
 )
 
 
-async def generate(prompt, first_frame_image=None, last_frame_image=None):
+async def generate(
+    prompt, first_frame_image=None, last_frame_image=None, generate_audio=None
+):
     try:
+        if generate_audio is False:
+            generate_audio = None
+        model_name = getenv("MODEL_VIDEO_NAME", DEFAULT_VIDEO_MODEL_NAME)
+
+        if model_name.startswith("doubao-seedance-1-0") and generate_audio:
+            logger.warning(
+                "The `doubao-seedance-1-0` series models do not support enabling the audio field. "
+                "Please upgrade to the doubao-seedance-1-5 series of you want to generate video with audio."
+            )
+            generate_audio = None
         if first_frame_image is None:
             response = client.content_generation.tasks.create(
                 model=getenv("MODEL_VIDEO_NAME", DEFAULT_VIDEO_MODEL_NAME),
                 content=[
                     {"type": "text", "text": prompt},
                 ],
+                generate_audio=generate_audio,
                 extra_headers={
                     "veadk-source": "veadk",
                     "veadk-version": VERSION,
@@ -112,7 +125,10 @@ async def generate(prompt, first_frame_image=None, last_frame_image=None):
 
 
 async def video_generate(
-    params: list, tool_context: ToolContext, batch_size: int = 10
+    params: list,
+    tool_context: ToolContext,
+    batch_size: int = 10,
+    max_wait_seconds: int = 1200,
 ) -> Dict:
     """
     Generate videos in **batch** from text prompts, optionally guided by a first/last frame,
@@ -126,6 +142,10 @@ async def video_generate(
             A list of video generation requests. Each item supports the fields below.
         batch_size (int):
             The number of videos to generate in a batch. Defaults to 10.
+        max_wait_seconds (int):
+            Maximum time in seconds to wait for all video tasks in each batch.
+            Default is 20 minutes (1200 seconds). When the timeout is reached,
+            unfinished tasks will be marked as timeout errors.
 
             Required per item:
                 - video_name (str):
@@ -148,6 +168,12 @@ async def video_generate(
                     URL or Base64 string (data URL) for the **last frame** (role = `last_frame`).
                     Use when you want the clip to end on a specific image.
 
+                - generate_audio (bool | None):
+                    Whether the generated video should include sound.
+                    If this field is not set (None) or is `False`, no sound is generated; `True` enables sound.
+                    The `doubao-seedance-1-0` series does not support audio, so the field is ignored (with a warning) for those models.
+                    To describe the sound content in detail, do so in the `prompt` field.
+
             Notes on first/last frame:
                 * When both frames are provided, **match width/height** to avoid cropping; if they differ,
                   the tail frame may be auto-cropped to fit.
@@ -222,6 +248,7 @@ async def video_generate(
     """
     success_list = []
     error_list = []
+    timeout_tasks = []
     logger.debug(f"Using model: {getenv('MODEL_VIDEO_NAME', DEFAULT_VIDEO_MODEL_NAME)}")
     logger.debug(f"video_generate params: {params}")
 
@@ -243,22 +270,32 @@ async def video_generate(
                 prompt = item["prompt"]
                 first_frame = item.get("first_frame", None)
                 last_frame = item.get("last_frame", None)
+                generate_audio = item.get("generate_audio", None)
                 try:
                     if not first_frame:
                         logger.debug(
                             f"video_generate task_{idx} text generation: prompt={prompt}"
                         )
-                        response = await generate(prompt)
+                        response = await generate(prompt, generate_audio=generate_audio)
                     elif not last_frame:
                         logger.debug(
                             f"video_generate task_{idx} first frame generation: prompt={prompt}, first_frame={first_frame}"
                         )
-                        response = await generate(prompt, first_frame)
+                        response = await generate(
+                            prompt,
+                            first_frame_image=first_frame,
+                            generate_audio=generate_audio,
+                        )
                     else:
                         logger.debug(
                             f"video_generate task_{idx} first and last frame generation: prompt={prompt}, first_frame={first_frame}, last_frame={last_frame}"
                         )
-                        response = await generate(prompt, first_frame, last_frame)
+                        response = await generate(
+                            prompt,
+                            first_frame_image=first_frame,
+                            last_frame_image=last_frame,
+                            generate_audio=generate_audio,
+                        )
                     logger.debug(
                         f"batch_{start_idx // batch_size} video_generate task_{idx} response: {response}"
                     )
@@ -270,6 +307,10 @@ async def video_generate(
 
             logger.debug("begin query video_generate task status...")
 
+            sleep_interval = 10
+            max_sleep_times = max_wait_seconds // sleep_interval
+            sleep_times = 0
+
             while True:
                 task_list = list(task_dict.keys())
                 if len(task_list) == 0:
@@ -303,7 +344,23 @@ async def video_generate(
                         logger.debug(
                             f"{task_dict[task_id]} video_generate current status: {status}, Retrying after 10 seconds..."
                         )
-                time.sleep(10)
+                if sleep_times >= max_sleep_times:
+                    logger.error(
+                        f"video_generate polling timed out after {max_wait_seconds} seconds; remaining tasks: {task_dict}"
+                    )
+                    for task_id, video_name in task_dict.items():
+                        timeout_tasks.append(
+                            {
+                                "task_id": task_id,
+                                "video_name": video_name,
+                            }
+                        )
+                        error_list.append(video_name)
+                    task_dict.clear()
+                    break
+
+                await asyncio.sleep(sleep_interval)
+                sleep_times += 1
 
             add_span_attributes(
                 span,
@@ -324,6 +381,7 @@ async def video_generate(
             "status": "error",
             "success_list": success_list,
             "error_list": error_list,
+            "timeout_tasks": timeout_tasks,
         }
     else:
         logger.debug(
@@ -333,6 +391,7 @@ async def video_generate(
             "status": "success",
             "success_list": success_list,
             "error_list": error_list,
+            "timeout_tasks": timeout_tasks,
         }