Support wan2.6 video (#85)

mose-zm · mose-x.zm · web-flow · commit e4b943f7ee1c · 2025-12-16T19:48:07.000+08:00
* support wan2.6 video generation

* support wan2.6 video generation

* support wan2.6 video generation

---------

Co-authored-by: mose-x.zm <zm02074348@alibaba-inc.com>
diff --git a/dashscope/aigc/video_synthesis.py b/dashscope/aigc/video_synthesis.py
@@ -1,11 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, List
 
 from dashscope.api_entities.dashscope_response import (DashScopeAPIResponse,
                                                        VideoSynthesisResponse)
 from dashscope.client.base_api import BaseAsyncApi, BaseAsyncAioApi
-from dashscope.common.constants import PROMPT
+from dashscope.common.constants import PROMPT, REFERENCE_VIDEO_URLS
 from dashscope.common.utils import _get_task_group_and_task
 from dashscope.utils.oss_utils import check_and_upload_local
 
@@ -39,6 +39,8 @@ def call(cls,
              template: str = None,
              img_url: str = None,
              audio_url: str = None,
+             reference_video_urls: List[str] = None,
+             reference_video_description: List[str] = None,
              api_key: str = None,
              extra_input: Dict = None,
              workspace: str = None,
@@ -58,6 +60,8 @@ def call(cls,
             template (str): LoRa input, such as gufeng, katong, etc.
             img_url (str): The input image url, Generate the URL of the image referenced by the video.
             audio_url (str): The input audio url
+            reference_video_urls (List[str]): List of URLs of the character reference videos uploaded by the user.
+            reference_video_description (List[str]): Descriptions of the picture and sound of each reference video, given in the same order as reference_video_urls. An error is reported if the number of descriptions differs from the number of URLs.
             api_key (str, optional): The api api_key. Defaults to None.
             workspace (str): The dashscope workspace id.
             extra_input (Dict): The extra input parameters.
@@ -79,6 +83,8 @@ def call(cls,
                             prompt,
                             img_url=img_url,
                             audio_url=audio_url,
+                            reference_video_urls=reference_video_urls,
+                            reference_video_description=reference_video_description,
                             api_key=api_key,
                             extend_prompt=extend_prompt,
                             negative_prompt=negative_prompt,
@@ -98,6 +104,8 @@ def _get_input(cls,
                    prompt: Any = None,
                    img_url: str = None,
                    audio_url: str = None,
+                   reference_video_urls: List[str] = None,
+                   reference_video_description: List[str] = None,
                    # """@deprecated, use prompt_extend in parameters """
                    extend_prompt: bool = True,
                    negative_prompt: str = None,
@@ -119,6 +127,8 @@ def _get_input(cls,
             inputs['template'] = template
         if function:
             inputs['function'] = function
+        if reference_video_description:
+            inputs['reference_video_description'] = reference_video_description
 
         has_upload = False
         upload_certificate = None
@@ -165,6 +175,17 @@ def _get_input(cls,
                 has_upload = True
             inputs['last_frame_url'] = res_last_frame_url
 
+        if (reference_video_urls is not None
+                and reference_video_urls and len(reference_video_urls) > 0):
+            new_videos = []
+            for video in reference_video_urls:
+                is_upload, new_video, upload_certificate = check_and_upload_local(
+                    model, video, api_key, upload_certificate)
+                if is_upload:
+                    has_upload = True
+                new_videos.append(new_video)
+            inputs[REFERENCE_VIDEO_URLS] = new_videos
+
         if extra_input is not None and extra_input:
             inputs = {**inputs, **extra_input}
         if has_upload:
@@ -185,6 +206,8 @@ def async_call(cls,
                    prompt: Any = None,
                    img_url: str = None,
                    audio_url: str = None,
+                   reference_video_urls: List[str] = None,
+                   reference_video_description: List[str] = None,
                    # """@deprecated, use prompt_extend in parameters """
                    extend_prompt: bool = True,
                    negative_prompt: str = None,
@@ -208,6 +231,8 @@ def async_call(cls,
             template (str): LoRa input, such as gufeng, katong, etc.
             img_url (str): The input image url, Generate the URL of the image referenced by the video.
             audio_url (str): The input audio url.
+            reference_video_urls (List[str]): List of URLs of the character reference videos uploaded by the user.
+            reference_video_description (List[str]): Descriptions of the picture and sound of each reference video, given in the same order as reference_video_urls. An error is reported if the number of descriptions differs from the number of URLs.
             api_key (str, optional): The api api_key. Defaults to None.
             workspace (str): The dashscope workspace id.
             extra_input (Dict): The extra input parameters.
@@ -229,7 +254,8 @@ def async_call(cls,
         task_group, function = _get_task_group_and_task(__name__)
 
         inputs, kwargs, task = cls._get_input(
-            model, prompt, img_url, audio_url, extend_prompt, negative_prompt, template, api_key,
+            model, prompt, img_url, audio_url, reference_video_urls, reference_video_description,
+            extend_prompt, negative_prompt, template, api_key,
             extra_input, task, function, head_frame, tail_frame,
             first_frame_url, last_frame_url, **kwargs)
 
@@ -354,6 +380,8 @@ async def call(cls,
                    prompt: Any = None,
                    img_url: str = None,
                    audio_url: str = None,
+                   reference_video_urls: List[str] = None,
+                   reference_video_description: List[str] = None,
                    # """@deprecated, use prompt_extend in parameters """
                    extend_prompt: bool = True,
                    negative_prompt: str = None,
@@ -377,6 +405,8 @@ async def call(cls,
             template (str): LoRa input, such as gufeng, katong, etc.
             img_url (str): The input image url, Generate the URL of the image referenced by the video.
             audio_url (str): The input audio url.
+            reference_video_urls (List[str]): List of URLs of the character reference videos uploaded by the user.
+            reference_video_description (List[str]): Descriptions of the picture and sound of each reference video, given in the same order as reference_video_urls. An error is reported if the number of descriptions differs from the number of URLs.
             api_key (str, optional): The api api_key. Defaults to None.
             workspace (str): The dashscope workspace id.
             extra_input (Dict): The extra input parameters.
@@ -396,7 +426,8 @@ async def call(cls,
         """
         task_group, f = _get_task_group_and_task(__name__)
         inputs, kwargs, task = VideoSynthesis._get_input(
-            model, prompt, img_url, audio_url, extend_prompt, negative_prompt, template, api_key,
+            model, prompt, img_url, audio_url, reference_video_urls, reference_video_description,
+            extend_prompt, negative_prompt, template, api_key,
             extra_input, task, f, head_frame, tail_frame,
             first_frame_url, last_frame_url, **kwargs)
         response = await super().call(model, inputs, task_group, task, f, api_key, workspace, **kwargs)
@@ -408,6 +439,8 @@ async def async_call(cls,
                    prompt: Any = None,
                    img_url: str = None,
                    audio_url: str = None,
+                   reference_video_urls: List[str] = None,
+                   reference_video_description: List[str] = None,
                    # """@deprecated, use prompt_extend in parameters """
                    extend_prompt: bool = True,
                    negative_prompt: str = None,
@@ -431,6 +464,8 @@ async def async_call(cls,
             template (str): LoRa input, such as gufeng, katong, etc.
             img_url (str): The input image url, Generate the URL of the image referenced by the video.
             audio_url (str): The input audio url.
+            reference_video_urls (List[str]): List of URLs of the character reference videos uploaded by the user.
+            reference_video_description (List[str]): Descriptions of the picture and sound of each reference video, given in the same order as reference_video_urls. An error is reported if the number of descriptions differs from the number of URLs.
             api_key (str, optional): The api api_key. Defaults to None.
             workspace (str): The dashscope workspace id.
             extra_input (Dict): The extra input parameters.
@@ -452,7 +487,8 @@ async def async_call(cls,
         task_group, function = _get_task_group_and_task(__name__)
 
         inputs, kwargs, task = VideoSynthesis._get_input(
-            model, prompt, img_url, audio_url, extend_prompt, negative_prompt, template, api_key,
+            model, prompt, img_url, audio_url, reference_video_urls, reference_video_description,
+            extend_prompt, negative_prompt, template, api_key,
             extra_input, task, function, head_frame, tail_frame,
             first_frame_url, last_frame_url, **kwargs)
 
diff --git a/dashscope/common/constants.py b/dashscope/common/constants.py
@@ -25,6 +25,7 @@
 HISTORY = 'history'
 CUSTOMIZED_MODEL_ID = 'customized_model_id'
 IMAGES = 'images'
+REFERENCE_VIDEO_URLS = 'reference_video_urls'
 TEXT_EMBEDDING_INPUT_KEY = 'texts'
 SERVICE_503_MESSAGE = 'Service temporarily unavailable, possibly overloaded or not ready.'  # noqa E501
 WEBSOCKET_ERROR_CODE = 44
diff --git a/samples/test_video_synthesis.py b/samples/test_video_synthesis.py
@@ -2,17 +2,21 @@
 from dashscope import VideoSynthesis
 import os
 
-prompt = "一幅史诗级可爱的场景。一只小巧可爱的卡通小猫将军，身穿细节精致的金色盔甲，头戴一个稍大的头盔，勇敢地站在悬崖上。他骑着一匹虽小但英勇的战马。悬崖下方，一支由老鼠组成的、数量庞大、无穷无尽的军队正带着临时制作的武器向前冲锋。这是一个戏剧性的、大规模的战斗场景，灵感来自中国古代的战争史诗。远处的雪山上空，天空乌云密布。整体氛围是“可爱”与“霸气”的搞笑和史诗般的融合"
+prompt = "一只小猫在月光下奔跑"
 audio_url = 'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250925/ozwpvi/rap.mp3'
+reference_video_urls = ["https://test-data-center.oss-accelerate.aliyuncs.com/wanx/video/resources/with_human_voice_11s.mov"]
 api_key = os.getenv("DASHSCOPE_API_KEY")
 
 
 def simple_call():
     print('----sync call, please wait a moment----')
     rsp = VideoSynthesis.call(api_key=api_key,
-                              model="wan2.5-t2v-preview",
-                              prompt=prompt,
-                              audio_url=audio_url)
+                              model="wan2.6-r2v",
+                              reference_video_urls=reference_video_urls,
+                              shot_type="multi",
+                              audio=True,
+                              watermark=True,
+                              prompt=prompt)
     if rsp.status_code == HTTPStatus.OK:
 
         print('response: %s' % rsp)