Skip to content

Commit 96069d0

Browse files
committed
feat: add multimodal generation tools
1 parent f4d366f commit 96069d0

17 files changed

+1024
-17
lines changed

src/agentscope_runtime/tools/__init__.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@
6060
from .generations.image_generation_wan25 import (
6161
ImageGenerationWan25,
6262
)
63+
from .generations.image_generation_wan26 import (
64+
ImageGenerationWan26,
65+
)
66+
from .generations.async_image_to_video_wan26 import (
67+
ImageToVideoWan26Submit,
68+
)
69+
from .generations.async_text_to_video_wan26 import (
70+
TextToVideoWan26Submit,
71+
)
72+
from .generations.fetch_wan import (
73+
WanVideoFetch,
74+
)
6375

6476

6577
class McpServerMeta(BaseModel):
@@ -116,4 +128,42 @@ class McpServerMeta(BaseModel):
116128
instructions="基于通义千问大模型的语音合成服务,支持多种语言语音合成功能",
117129
components=[QwenTextToSpeech],
118130
),
131+
"modelstudio_wan_multimodal": McpServerMeta(
132+
instructions=(
133+
"通义万相(Wan)多模态生成统一服务,支持文本/图像/语音到图像或视频的多种AI生成能力,"
134+
"包括图像生成、编辑、风格迁移、文生视频、图生视频、数字人表演等。"
135+
"当前支持 anx-style-repaint-v1、wan2.1、wan2.2、wan2.5、wan2.6 模型版本"
136+
"(wan2.1 仅用于基础图像编辑,wanx-style-repaint-v1仅用于人体风格重绘),各版本能力如下:\n"
137+
"- 文本生成图像:wan2.2、wan2.5、wan2.6 均支持,优先使用 wan2.6(画质最优),其次 wan2.5\n"
138+
"- 图像编辑:wan2.1(基础)、wan2.5、wan2.6 支持,优先使用 wan2.6\n"
139+
"- 文本/图像生成视频:wan2.2、wan2.5、wan2.6 均支持,但能力逐代增强:\n"
140+
" - 视频时长:wan2.2 仅支持 5 秒;wan2.5 支持 5 或 10 秒;wan2.6 支持 5、10 或 15 秒\n" # noqa
141+
" - 音频能力:支持自动配音或传入自定义音频实现声画同步(仅 wan2.5 和 wan2.6 支持)\n"
142+
" - 多镜头叙事:可生成包含多个镜头的视频,并在切换时保持主体一致性(仅 wan2.6 支持)\n"
143+
"- 数字人生成(音频驱动人物视频):基于单张人物图像与音频,生成自然说话、唱歌或表演视频;"
144+
"支持肖像、半身或全身画面,不限画幅比例;由 wan2.2 提供基础支持,wan2.5/2.6 支持更高质量与音频同步\n"
145+
"注意:异步视频仅提交生成任务,需配合的 Fetch 工具获取结果。\n"
146+
"注意:不同任务对模型版本有严格依赖,请务必结合具体工具描述中的[模型版本]信息进行调用。"
147+
),
148+
components=[
149+
# 基于通义万相大模型的智能图像生成服务,提供高质量的图像处理和编辑功能
150+
ImageGeneration, # wan2.2-t2i 文生图
151+
ImageEdit, # wan2.1-edit 图生图
152+
ImageStyleRepaint, # wan2.2-repaint 图风格迁移
153+
# 基于通义万相大模型提供AI视频生成服务,支持文本到视频、图像到视频和语音到视频的多模态生成功能
154+
TextToVideoSubmit, # wan2.2-t2v 文生视频提交
155+
ImageToVideoSubmit, # wan2.2-i2v 图生视频提交
156+
SpeechToVideoSubmit, # wan2.2-s2v
157+
# 基于通义万相大模型2.5版本提供的图像和视频生成服务
158+
ImageGenerationWan25, # wan2.5 文生图
159+
ImageEditWan25, # wan2.5 图生图
160+
TextToVideoWan25Submit, # wan2.5 文生视频提交
161+
ImageToVideoWan25Submit, # wan2.5 图生视频提交
162+
# 基于通义万相2.6大模型的智能图像生成服务,提供高质量的图像处理和编辑功能
163+
ImageGenerationWan26, # wanx2.6-t2i 文生图
164+
ImageToVideoWan26Submit, # wan2.6-i2v 图生视频提交
165+
TextToVideoWan26Submit, # wan2.6-t2v 文生视频提交
166+
WanVideoFetch, # wan 所有异步视频任务结果查询
167+
],
168+
),
119169
}

src/agentscope_runtime/tools/generations/async_image_to_video.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,9 @@ class ImageToVideoSubmit(
101101

102102
name: str = "modelstudio_image_to_video_submit_task"
103103
description: str = (
104-
"通义万相-图生视频模型的异步任务提交工具。根据首帧图像和文本提示词,生成时长为5秒的无声视频。"
105-
"同时支持特效模板,可添加“魔法悬浮”、“气球膨胀”等效果,适用于创意视频制作、娱乐特效展示等场景。"
104+
"[版本: wan2.2] 通义万相图生视频模型(wan2.2-i2v-flash)异步任务提交工具。基于单张首帧图像和文本提示,生成一段5秒的无声动态视频。\n" # noqa
105+
"支持分辨率:480P、720P、1080P;不支持音频(无声视频)。\n"
106+
"提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频制作、娱乐特效展示等场景。\n"
106107
)
107108

108109
@trace(trace_type="AIGC", trace_name="image_to_video_submit")

src/agentscope_runtime/tools/generations/async_image_to_video_wan25.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,9 @@ class ImageToVideoWan25Submit(
111111

112112
name: str = "modelstudio_image_to_video_wan25_submit_task"
113113
description: str = (
114-
"通义万相-图生视频模型的异步任务提交工具。根据首帧图像和文本提示词,生成时长为5秒的无声视频。"
115-
"同时支持特效模板,可添加“魔法悬浮”、“气球膨胀”等效果,适用于创意视频制作、娱乐特效展示等场景。"
114+
"[版本: wan2.5] 通义万相图生视频模型(wan2.5-i2v)异步提交工具。基于单张首帧图像和文本提示,生成一段流畅的有声视频。\n" # noqa
115+
"支持视频时长:5秒或10秒;分辨率:480P、720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n"
116+
"提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频、娱乐特效等场景。\n"
116117
)
117118

118119
@trace(trace_type="AIGC", trace_name="image_to_video_wan25_submit")
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
# -*- coding: utf-8 -*-
2+
# pylint:disable=abstract-method, deprecated-module, wrong-import-order
3+
4+
import os
5+
import uuid
6+
from http import HTTPStatus
7+
from typing import Any, Optional
8+
9+
from dashscope.aigc.video_synthesis import AioVideoSynthesis
10+
from mcp.server.fastmcp import Context
11+
from pydantic import BaseModel, Field
12+
13+
from ..base import Tool
14+
from ..utils.api_key_util import get_api_key, ApiNames
15+
from ...engine.tracing import trace, TracingUtil
16+
17+
18+
class ImageToVideoWan26SubmitInput(BaseModel):
    """
    Input model for submitting an image-to-video task using wan2.6-i2v.

    Only ``image_url`` is required.  Every other field is optional; when a
    field is left as ``None`` the submit tool omits it from the request so
    the DashScope service-side default applies.
    """

    # Required first-frame image.
    # FIX: the original description ended in garbled text ("Base64编码径");
    # reworded as URL-or-Base64 input. NOTE(review): confirm the exact set
    # of accepted input forms against the DashScope wan2.6-i2v reference.
    image_url: str = Field(
        ...,
        description="首帧图像的公网可访问URL或Base64编码数据,支持 JPG/PNG 格式",
    )
    # Positive prompt: the motion/changes desired in the generated video.
    prompt: Optional[str] = Field(
        default=None,
        description="正向提示词,描述希望视频中发生的动作或变化,例如“镜头缓慢推进,风吹动树叶”。",
    )
    # Negative prompt: content to suppress.
    negative_prompt: Optional[str] = Field(
        default=None,
        description="反向提示词,用于排除不希望出现的内容,例如“模糊、闪烁、变形、水印”。",
    )
    # Custom soundtrack URL; takes precedence over the `audio` flag.
    audio_url: Optional[str] = Field(
        default=None,
        description="自定义音频文件的公网URL。参数优先级:audio_url > audio。",
    )
    # Auto-dub toggle; only consulted when no audio_url is given.
    audio: Optional[bool] = Field(
        default=None,
        description="是否自动生成配音。仅在 audio_url 未提供时生效。",
    )
    # Optional visual-effect template identifier.
    template: Optional[str] = Field(
        default=None,
        description="视频特效模板,如:squish(解压捏捏)、flying(魔法悬浮)、carousel(时光木马)等。",
    )
    # Output resolution; service default is 1080P.
    resolution: Optional[str] = Field(
        default=None,
        description="视频分辨率,可选值:'720P'、'1080P'。默认为 '1080P'。",
    )
    # Clip length in seconds; service default is 5.
    duration: Optional[int] = Field(
        default=None,
        description="视频时长(秒),可选值:5、10、15。默认为 5。",
    )
    # Smart prompt rewriting; must be enabled for shot_type to take effect.
    prompt_extend: Optional[bool] = Field(
        default=None,
        description=" Prompt 智能改写。开启后可提升生成效果,并使 shot_type 生效,"
        "默认值为 true:开启智能改写。false:不开启智能改写。",
    )
    # Single- vs multi-shot narrative; only honored when prompt_extend=true.
    shot_type: Optional[str] = Field(
        default=None,
        description="镜头类型,仅在 prompt_extend=true 时生效。"
        "可选值:'single'(单镜头,默认)、'multi'(多镜头切换)。"
        "参数优先级高于 prompt 中的描述。",
    )
    # Whether to stamp an "AI generated" watermark on the video.
    watermark: Optional[bool] = Field(
        default=None,
        description="是否在视频中添加水印(如“AI生成”标识)。默认不添加。",
    )
    # Random seed for result reproducibility.
    seed: Optional[int] = Field(
        default=None,
        description="随机种子,用于结果复现。",
    )
    # MCP transport context; injected by the runtime, never model-generated.
    ctx: Optional[Context] = Field(
        default=None,
        description="HTTP request context containing headers for mcp only, "
        "don't generate it",
    )
79+
80+
81+
class ImageToVideoWan26SubmitOutput(BaseModel):
    """
    Output of the image-to-video task submission.

    Carries only the async task handle; the finished video is retrieved
    later by polling with ``task_id``.
    """

    # Unique identifier of the submitted asynchronous task.
    task_id: str = Field(
        title="Task ID",
        description="异步任务的唯一标识符。",
    )
    # Task lifecycle state as reported by the service at submit time.
    task_status: str = Field(
        title="Task Status",
        description="视频生成的任务状态,PENDING:任务排队中,RUNNING:任务处理中,SUCCEEDED:任务执行成功,"
        "FAILED:任务执行失败,CANCELED:任务取消成功,UNKNOWN:任务不存在或状态未知",
    )
    # Request id for log correlation; may be absent.
    request_id: Optional[str] = Field(
        default=None,
        title="Request ID",
        description="本次请求的唯一ID,可用于日志追踪。",
    )
100+
101+
102+
class ImageToVideoWan26Submit(
    Tool[ImageToVideoWan26SubmitInput, ImageToVideoWan26SubmitOutput],
):
    """
    Asynchronously submit a wan2.6-i2v image-to-video generation task.

    Returns a task id immediately; the finished video must be fetched
    later via the corresponding fetch tool.
    """

    name: str = "modelstudio_image_to_video_wan26_submit_task"
    description: str = (
        "[版本: wan2.6] 通义万相图生视频模型(wan2.6-i2v)异步任务提交工具。基于单张首帧图像和文本提示,生成一段流畅的有声视频。\n"  # noqa
        "支持视频时长:5秒、10秒或15秒;分辨率:720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n"
        "独家支持多镜头叙事:可生成包含多个镜头的视频,并在镜头切换时保持主体一致性。\n"
        "提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频制作、娱乐特效展示等场景。\n"
    )

    @trace(trace_type="AIGC", trace_name="image_to_video_wan26_submit")
    async def arun(
        self,
        args: ImageToVideoWan26SubmitInput,
        **kwargs: Any,
    ) -> ImageToVideoWan26SubmitOutput:
        """Submit the task and return its id/status; raises on failure."""
        trace_event = kwargs.pop("trace_event", None)
        fallback_request_id = TracingUtil.get_request_id()

        try:
            api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs)
        except AssertionError as e:
            raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e

        # Model resolution: explicit kwarg > environment variable > default.
        model_name = kwargs.get(
            "model_name",
            os.getenv("IMAGE_TO_VIDEO_MODEL_NAME", "wan2.6-i2v"),
        )

        # Optional knobs: a key is forwarded only when the caller supplied a
        # value (flags compared against None, the string options by
        # truthiness), so unset fields keep the service-side defaults.
        extra_params = {
            key: value
            for key, value, supplied in (
                ("audio", args.audio, args.audio is not None),
                ("resolution", args.resolution, bool(args.resolution)),
                ("duration", args.duration, args.duration is not None),
                (
                    "prompt_extend",
                    args.prompt_extend,
                    args.prompt_extend is not None,
                ),
                ("shot_type", args.shot_type, bool(args.shot_type)),
                ("watermark", args.watermark, args.watermark is not None),
                ("seed", args.seed, args.seed is not None),
            )
            if supplied
        }

        synthesizer = AioVideoSynthesis()
        # The DashScope SDK expects the first-frame image under ``img_url``.
        response = await synthesizer.async_call(
            model=model_name,
            api_key=api_key,
            img_url=args.image_url,
            prompt=args.prompt,
            negative_prompt=args.negative_prompt,
            audio_url=args.audio_url,
            template=args.template,
            **extra_params,
        )

        if trace_event:
            trace_event.on_log(
                "",
                step_suffix="results",
                payload={
                    "request_id": fallback_request_id,
                    "submit_task": response,
                },
            )

        # Treat HTTP failure, a missing payload, or an immediately terminal
        # failure state as a submit error.
        submit_failed = (
            response.status_code != HTTPStatus.OK
            or not response.output
            or response.output.task_status in ("FAILED", "CANCELED")
        )
        if submit_failed:
            raise RuntimeError(
                f"Failed to submit image-to-video task: {response}",
            )

        request_id = (
            response.request_id or fallback_request_id or str(uuid.uuid4())
        )

        return ImageToVideoWan26SubmitOutput(
            request_id=request_id,
            task_id=response.output.task_id,
            task_status=response.output.task_status,
        )
194+
195+
196+
# ========== Fetch tool: poll the result of a submitted wan2.6-i2v task ==========
197+
198+
199+
class ImageToVideoWan26FetchInput(BaseModel):  # noqa
    """Input for querying the result of a wan2.6-i2v video task."""

    # Id returned by the submit tool; identifies the async task to poll.
    task_id: str = Field(
        title="Task ID",
        description="要查询的视频生成任务ID。",
    )
    # MCP transport context; injected by the runtime, never model-generated.
    ctx: Optional[Context] = Field(
        default=None,
        description="HTTP request context containing headers for mcp only, "
        "don't generate it",
    )
209+
210+
211+
class ImageToVideoWan26FetchOutput(BaseModel):
    """Result of a wan2.6-i2v task query: final status plus video URL."""

    # Publicly reachable URL of the generated MP4.
    video_url: str = Field(
        title="Video URL",
        description="生成视频的公网可访问URL(MP4格式)。",
    )
    # Echo of the queried task id.
    task_id: str = Field(
        title="Task ID",
        description="任务ID,与输入一致。",
    )
    # Final task state; SUCCEEDED on success.
    task_status: str = Field(
        title="Task Status",
        description="任务最终状态,成功时为 SUCCEEDED。",
    )
    # Request id for log correlation; may be absent.
    request_id: Optional[str] = Field(
        default=None,
        title="Request ID",
        description="请求ID,用于追踪。",
    )
229+
230+
231+
class ImageToVideoWan26Fetch(
    Tool[ImageToVideoWan26FetchInput, ImageToVideoWan26FetchOutput],
):
    """
    Poll the result of a previously submitted wan2.6-i2v task.

    Intended to be called repeatedly with the task id returned by the
    submit tool until the task reaches a terminal state.
    """

    name: str = "modelstudio_image_to_video_wan26_fetch_result"
    description: str = (
        "查询通义万相 wan2.6-i2v 图生视频任务的结果。"
        "输入 Task ID,返回生成的视频 URL 及任务状态。"
        "请在提交任务后轮询此接口,直到任务状态变为 SUCCEEDED。"
    )

    @trace(trace_type="AIGC", trace_name="image_to_video_wan26_fetch")
    async def arun(
        self,
        args: ImageToVideoWan26FetchInput,
        **kwargs: Any,
    ) -> ImageToVideoWan26FetchOutput:
        """Look up the task once and return its status and video URL."""
        trace_event = kwargs.pop("trace_event", None)
        fallback_request_id = TracingUtil.get_request_id()

        try:
            api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs)
        except AssertionError as e:
            raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e

        synthesizer = AioVideoSynthesis()
        response = await synthesizer.fetch(
            api_key=api_key,
            task=args.task_id,
        )

        if trace_event:
            trace_event.on_log(
                "",
                step_suffix="results",
                payload={
                    "request_id": response.request_id,
                    "fetch_result": response,
                },
            )

        # HTTP failure, missing payload, or a terminal failure state all
        # surface as an error to the caller.
        fetch_failed = (
            response.status_code != HTTPStatus.OK
            or not response.output
            or response.output.task_status in ("FAILED", "CANCELED")
        )
        if fetch_failed:
            raise RuntimeError(
                f"Failed to fetch image-to-video result: {response}",
            )

        request_id = (
            response.request_id or fallback_request_id or str(uuid.uuid4())
        )

        # NOTE(review): while the task is still PENDING/RUNNING the payload
        # may not carry video_url yet — verify the SDK's response shape for
        # non-terminal states.
        return ImageToVideoWan26FetchOutput(
            video_url=response.output.video_url,
            task_id=response.output.task_id,
            task_status=response.output.task_status,
            request_id=request_id,
        )

src/agentscope_runtime/tools/generations/async_speech_to_video.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ class SpeechToVideoSubmit(
7979

8080
name: str = "modelstudio_speech_to_video_submit_task"
8181
description: str = (
82-
"数字人wan2.2-s2v模型的异步任务提交工具。能基于单张图片和音频,生成动作自然的说话、"
83-
"唱歌或表演视频。通过输入的人声音频,驱动静态图片中的人物实现口型、表情和动作与音频同步。"
84-
"支持说话、唱歌、表演三种对口型场景,支持真人及卡通人物,提供480P、720P两档分辨率选项。"
82+
"[版本: wan2.2] 通义万相语音驱动视频模型(wan2.2-s2v)异步任务提交工具。基于单张人物图像和一段音频,生成动作自然的说话、唱歌或表演视频。\n" # noqa
83+
"支持肖像、半身或全身人物图像,不限画幅比例;视频时长固定为5秒,为有声视频(音频即输入源)。\n"
84+
"适用于数字人播报、虚拟表演等场景。\n"
8585
)
8686

8787
@staticmethod

0 commit comments

Comments
 (0)