Commit c8d964a

Merge pull request #47 from kevinlin09/feat/aio_vl

feat(model/vl): support aio interface

2 parents b3e4f89 + c730001
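
This PR adds an asyncio-compatible counterpart to MultiModalConversation. A minimal sketch of the non-streaming usage the new interface enables (assuming the dashscope package from this branch is installed and DASHSCOPE_API_KEY is set; the model name is taken from the test file at the end of this diff):

    import asyncio
    import os

    from dashscope import AioMultiModalConversation

    async def main():
        messages = [{
            'role': 'user',
            'content': [{'text': 'Hello!'}],
        }]
        # Non-streaming: the coroutine resolves to a single response object.
        response = await AioMultiModalConversation.call(
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            model='qwen-vl-max-latest',
            messages=messages,
        )
        print(response.output.choices[0].message.content)

    asyncio.run(main())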

File tree

5 files changed: +287 −4 lines


dashscope/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@
 from dashscope.aigc.conversation import Conversation, History, HistoryItem
 from dashscope.aigc.generation import AioGeneration, Generation
 from dashscope.aigc.image_synthesis import ImageSynthesis
-from dashscope.aigc.multimodal_conversation import MultiModalConversation
+from dashscope.aigc.multimodal_conversation import MultiModalConversation, AioMultiModalConversation
 from dashscope.aigc.video_synthesis import VideoSynthesis
 from dashscope.app.application import Application
 from dashscope.assistants import Assistant, AssistantList, Assistants
@@ -60,6 +60,7 @@
     MultiModalEmbeddingItemText,
     SpeechSynthesizer,
     MultiModalConversation,
+    AioMultiModalConversation,
     BatchTextEmbedding,
     BatchTextEmbeddingResponse,
     Understanding,
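
With this top-level re-export in place, callers can import the async class directly from the package root; a one-line sketch of the import this hunk enables (the dashscope.aigc path is added in the next file):

    from dashscope import AioMultiModalConversation  # new top-level export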

dashscope/aigc/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 from .conversation import Conversation, History, HistoryItem
 from .generation import Generation
 from .image_synthesis import ImageSynthesis
-from .multimodal_conversation import MultiModalConversation
+from .multimodal_conversation import MultiModalConversation, AioMultiModalConversation
 from .video_synthesis import VideoSynthesis

 __all__ = [
@@ -13,5 +13,6 @@
     History,
     ImageSynthesis,
     MultiModalConversation,
+    AioMultiModalConversation,
     VideoSynthesis,
 ]

dashscope/aigc/multimodal_conversation.py

Lines changed: 122 additions & 1 deletion
@@ -5,7 +5,7 @@

 from dashscope.api_entities.dashscope_response import \
     MultiModalConversationResponse
-from dashscope.client.base_api import BaseApi
+from dashscope.client.base_api import BaseAioApi, BaseApi
 from dashscope.common.error import InputRequired, ModelRequired
 from dashscope.common.utils import _get_task_group_and_task
 from dashscope.utils.oss_utils import preprocess_message_element
@@ -130,3 +130,124 @@ def _preprocess_messages(cls, model: str, messages: List[dict],
         if is_upload and not has_upload:
             has_upload = True
     return has_upload
+
+
+class AioMultiModalConversation(BaseAioApi):
+    """Async multimodal conversational robot interface."""
+    task = 'multimodal-generation'
+    function = 'generation'
+
+    class Models:
+        qwen_vl_chat_v1 = 'qwen-vl-chat-v1'
+
+    @classmethod
+    async def call(
+        cls,
+        model: str,
+        messages: List,
+        api_key: str = None,
+        workspace: str = None,
+        **kwargs
+    ) -> Union[MultiModalConversationResponse,
+               Generator[MultiModalConversationResponse, None, None]]:
+        """Call the conversation model service asynchronously.
+
+        Args:
+            model (str): The requested model, such as 'qwen-multimodal-v1'.
+            messages (list): The generation messages.
+                example:
+                    [
+                        {
+                            "role": "system",
+                            "content": [
+                                {"text": "You are DAMO Academy's life assistant robot."}
+                            ]
+                        },
+                        {
+                            "role": "user",
+                            "content": [
+                                {"image": "http://XXXX"},
+                                {"text": "Where was this picture taken?"},
+                            ]
+                        }
+                    ]
+            api_key (str, optional): The api_key; may be None, in which
+                case it is resolved by rule [1].
+                [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings  # noqa E501
+            workspace (str): The dashscope workspace id.
+            **kwargs:
+                stream (bool, `optional`): Enable server-sent events
+                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)  # noqa E501
+                    so the result is returned in partial chunks
+                    [qwen-turbo, bailian-v1].
+                max_length (int, `optional`): The maximum number of tokens
+                    to generate. The token count of your prompt plus
+                    max_length cannot exceed the model's context length.
+                    Most models have a context length of 2000 tokens
+                    [qwen-turbo, bailian-v1].
+                top_p (float, `optional`): A sampling strategy, called
+                    nucleus sampling, in which the model considers only the
+                    tokens comprising the top_p probability mass, so 0.1
+                    means only the tokens in the top 10% probability mass
+                    are considered [qwen-turbo, bailian-v1].
+                top_k (float, `optional`):
+
+        Raises:
+            InputRequired: messages is required.
+            ModelRequired: model is required.
+
+        Returns:
+            Union[MultiModalConversationResponse,
+                  Generator[MultiModalConversationResponse, None, None]]:
+            If stream is True, return a Generator, otherwise a
+            MultiModalConversationResponse.
+        """
+        if messages is None or not messages:
+            raise InputRequired('messages is required!')
+        if model is None or not model:
+            raise ModelRequired('Model is required!')
+        task_group, _ = _get_task_group_and_task(__name__)
+        msg_copy = copy.deepcopy(messages)
+        has_upload = cls._preprocess_messages(model, msg_copy, api_key)
+        if has_upload:
+            headers = kwargs.pop('headers', {})
+            headers['X-DashScope-OssResourceResolve'] = 'enable'
+            kwargs['headers'] = headers
+        input = {'messages': msg_copy}
+        response = await super().call(
+            model=model,
+            task_group=task_group,
+            task=AioMultiModalConversation.task,
+            function=AioMultiModalConversation.function,
+            api_key=api_key,
+            input=input,
+            workspace=workspace,
+            **kwargs)
+        is_stream = kwargs.get('stream', False)
+        if is_stream:
+            return (MultiModalConversationResponse.from_api_response(rsp)
+                    async for rsp in response)
+        else:
+            return MultiModalConversationResponse.from_api_response(response)
+
+    @classmethod
+    def _preprocess_messages(cls, model: str, messages: List[dict],
+                             api_key: str):
+        """Upload any local media referenced in the message elements.
+
+        messages example:
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"image": ""},
+                        {"text": ""},
+                    ]
+                }
+            ]
+        """
+        has_upload = False
+        for message in messages:
+            content = message['content']
+            for elem in content:
+                if not isinstance(elem,
+                                  (int, float, bool, str, bytes, bytearray)):
+                    is_upload = preprocess_message_element(
+                        model, elem, api_key)
+                    if is_upload and not has_upload:
+                        has_upload = True
+        return has_upload
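
Note that when stream=True the coroutine resolves to an async generator rather than a single response, so callers consume it with async for. A sketch of that consumption pattern (assumptions as in the earlier sketch: installed package, DASHSCOPE_API_KEY set, model name from the test file):

    import asyncio
    import os

    from dashscope.aigc.multimodal_conversation import AioMultiModalConversation

    async def stream_demo():
        messages = [{'role': 'user', 'content': [{'text': 'Describe the weather.'}]}]
        # With stream=True, the awaited call returns an async generator
        # of partial MultiModalConversationResponse chunks.
        responses = await AioMultiModalConversation.call(
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            model='qwen-vl-max-latest',
            messages=messages,
            stream=True,
        )
        async for chunk in responses:
            print(chunk.output.choices[0].message.content)

    asyncio.run(stream_demo())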

dashscope/utils/oss_utils.py

Lines changed: 1 addition & 1 deletion
@@ -188,6 +188,6 @@ def check_and_upload(model, elem: dict, api_key):
     return has_upload


-def preprocess_message_element(model: str, elem: List[dict], api_key: str):
+def preprocess_message_element(model: str, elem: dict, api_key: str):
    is_upload = check_and_upload(model, elem, api_key)
    return is_upload
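
The annotation fix matches how both the sync class and the new async class actually invoke this helper: with one content-element dict at a time, never a list. A sketch of the calling pattern (illustrative only; check_and_upload decides whether the element references a local file that needs uploading):

    # Each element is a single dict such as {'image': 'file:///tmp/cat.png'}
    # or {'text': 'hello'}; the helper is applied element by element.
    for message in messages:
        for elem in message['content']:
            if isinstance(elem, dict):
                preprocess_message_element(model, elem, api_key)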
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
+import os
+import asyncio
+import dashscope
+from dashscope.aigc.multimodal_conversation import AioMultiModalConversation
+
+
+async def test_aio_multimodal_conversation():
+    """Test async multimodal conversation API."""
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"},
+                {"text": "What scene does the picture depict?"}
+            ]
+        }
+    ]
+
+    # Call asynchronously
+    response = await AioMultiModalConversation.call(
+        api_key=os.getenv('DASHSCOPE_API_KEY'),
+        model='qwen-vl-max-latest',
+        messages=messages,
+        enable_encryption=True,
+    )
+
+    print("Response:", response.output.choices[0].message.content[0]["text"])
+
+
+async def test_aio_multimodal_conversation_stream():
+    """Test async multimodal conversation API with streaming."""
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"},
+                {"text": "Please describe the content of this image in detail"}
+            ]
+        }
+    ]
+
+    # Call asynchronously with streaming
+    async for chunk in await AioMultiModalConversation.call(
+        api_key=os.getenv('DASHSCOPE_API_KEY'),
+        model='qwen-vl-max-latest',
+        messages=messages,
+        stream=True,
+        incremental_output=True,
+        enable_encryption=True,
+    ):
+        if hasattr(chunk, 'output') and chunk.output and chunk.output.choices:
+            content = chunk.output.choices[0].message.content
+            if content and len(content) > 0 and "text" in content[0]:
+                print(content[0]["text"], end="", flush=True)
+    print()  # newline
+
+
+async def test_aio_multimodal_conversation_local_image():
+    """Test async multimodal conversation API with a local image."""
+    # Use a local image file
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "tests/data/bird.JPEG"},  # an image from the test data
+                {"text": "What is this picture?"}
+            ]
+        }
+    ]
+
+    try:
+        response = await AioMultiModalConversation.call(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            model='qwen-vl-max-latest',
+            messages=messages,
+            enable_encryption=True,
+        )
+        print("Local image response:", response.output.choices[0].message.content[0]["text"])
+    except Exception as e:
+        print(f"Error with local image: {e}")
+
+
+async def test_aio_multimodal_conversation_multiple_local_images():
+    """Test async multimodal conversation API with multiple local images."""
+    # Use multiple local image files
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "tests/data/bird.JPEG"},
+                {"image": "tests/data/dogs.jpg"},
+                {"text": "Please compare the differences between these two images"}
+            ]
+        }
+    ]
+
+    try:
+        print("Starting multiple local images test...")
+        response = await AioMultiModalConversation.call(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            model='qwen-vl-max-latest',
+            messages=messages,
+            enable_encryption=True,
+        )
+        print("Multiple local images response:", response.output.choices[0].message.content[0]["text"])
+    except Exception as e:
+        print(f"Error with multiple local images: {e}")
+
+
+async def main():
+    """Main function to run all tests."""
+    print("Testing Async MultiModal Conversation API...")
+    print("=" * 50)
+
+    # Test the basic async call
+    print("\n1. Testing basic async call:")
+    await test_aio_multimodal_conversation()
+
+    # Test the async streaming call
+    print("\n2. Testing async streaming call:")
+    await test_aio_multimodal_conversation_stream()
+
+    # Test with a local image
+    print("\n3. Testing with local image:")
+    await test_aio_multimodal_conversation_local_image()
+
+    # Test with multiple local images
+    print("\n4. Testing with multiple local images:")
+    await test_aio_multimodal_conversation_multiple_local_images()
+
+    print("\nAll tests completed!")
+
+
+if __name__ == "__main__":
+    # Run the async tests
+    asyncio.run(main())
