Commit c8d964a

Merge pull request #47 from kevinlin09/feat/aio_vl

feat(model/vl): support aio interface

2 parents b3e4f89 + c730001
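
This PR adds an asyncio-compatible counterpart to MultiModalConversation. A minimal sketch of the non-streaming usage the new interface enables (assuming the dashscope package from this branch is installed and DASHSCOPE_API_KEY is set; the model name is taken from the test file at the end of this diff):

    import asyncio
    import os

    from dashscope import AioMultiModalConversation

    async def main():
        messages = [{
            'role': 'user',
            'content': [{'text': 'Hello!'}],
        }]
        # Non-streaming: the coroutine resolves to a single response object.
        response = await AioMultiModalConversation.call(
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            model='qwen-vl-max-latest',
            messages=messages,
        )
        print(response.output.choices[0].message.content)

    asyncio.run(main())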

File tree

5 files changed: +287 −4 lines


dashscope/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@
 from dashscope.aigc.conversation import Conversation, History, HistoryItem
 from dashscope.aigc.generation import AioGeneration, Generation
 from dashscope.aigc.image_synthesis import ImageSynthesis
-from dashscope.aigc.multimodal_conversation import MultiModalConversation
+from dashscope.aigc.multimodal_conversation import MultiModalConversation, AioMultiModalConversation
 from dashscope.aigc.video_synthesis import VideoSynthesis
 from dashscope.app.application import Application
 from dashscope.assistants import Assistant, AssistantList, Assistants
@@ -60,6 +60,7 @@
     MultiModalEmbeddingItemText,
     SpeechSynthesizer,
     MultiModalConversation,
+    AioMultiModalConversation,
     BatchTextEmbedding,
     BatchTextEmbeddingResponse,
     Understanding,
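
With this top-level re-export in place, callers can import the async class directly from the package root; a one-line sketch of the import this hunk enables (the dashscope.aigc path is added in the next file):

    from dashscope import AioMultiModalConversation  # new top-level export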

dashscope/aigc/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 from .conversation import Conversation, History, HistoryItem
 from .generation import Generation
 from .image_synthesis import ImageSynthesis
-from .multimodal_conversation import MultiModalConversation
+from .multimodal_conversation import MultiModalConversation, AioMultiModalConversation
 from .video_synthesis import VideoSynthesis

 __all__ = [
@@ -13,5 +13,6 @@
     History,
     ImageSynthesis,
     MultiModalConversation,
+    AioMultiModalConversation,
     VideoSynthesis,
 ]

dashscope/aigc/multimodal_conversation.py

Lines changed: 122 additions & 1 deletion
@@ -5,7 +5,7 @@

 from dashscope.api_entities.dashscope_response import \
     MultiModalConversationResponse
-from dashscope.client.base_api import BaseApi
+from dashscope.client.base_api import BaseAioApi, BaseApi
 from dashscope.common.error import InputRequired, ModelRequired
 from dashscope.common.utils import _get_task_group_and_task
 from dashscope.utils.oss_utils import preprocess_message_element
@@ -130,3 +130,124 @@ def _preprocess_messages(cls, model: str, messages: List[dict],
         if is_upload and not has_upload:
             has_upload = True
     return has_upload
+
+
+class AioMultiModalConversation(BaseAioApi):
+    """Async multimodal conversational robot interface."""
+    task = 'multimodal-generation'
+    function = 'generation'
+
+    class Models:
+        qwen_vl_chat_v1 = 'qwen-vl-chat-v1'
+
+    @classmethod
+    async def call(
+        cls,
+        model: str,
+        messages: List,
+        api_key: str = None,
+        workspace: str = None,
+        **kwargs
+    ) -> Union[MultiModalConversationResponse,
+               Generator[MultiModalConversationResponse, None, None]]:
+        """Call the conversation model service asynchronously.
+
+        Args:
+            model (str): The requested model, such as 'qwen-multimodal-v1'.
+            messages (list): The generation messages.
+                example:
+                    [
+                        {
+                            "role": "system",
+                            "content": [
+                                {"text": "You are DAMO Academy's life assistant robot."}
+                            ]
+                        },
+                        {
+                            "role": "user",
+                            "content": [
+                                {"image": "http://XXXX"},
+                                {"text": "Where was this picture taken?"},
+                            ]
+                        }
+                    ]
+            api_key (str, optional): The api_key; may be None, in which
+                case it is resolved by rule [1].
+                [1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings  # noqa E501
+            workspace (str): The dashscope workspace id.
+            **kwargs:
+                stream (bool, `optional`): Enable server-sent events
+                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)  # noqa E501
+                    so the result is returned in partial chunks
+                    [qwen-turbo, bailian-v1].
+                max_length (int, `optional`): The maximum number of tokens
+                    to generate. The token count of your prompt plus
+                    max_length cannot exceed the model's context length.
+                    Most models have a context length of 2000 tokens
+                    [qwen-turbo, bailian-v1].
+                top_p (float, `optional`): A sampling strategy, called
+                    nucleus sampling, in which the model considers only the
+                    tokens comprising the top_p probability mass, so 0.1
+                    means only the tokens in the top 10% probability mass
+                    are considered [qwen-turbo, bailian-v1].
+                top_k (float, `optional`):
+
+        Raises:
+            InputRequired: messages is required.
+            ModelRequired: model is required.
+
+        Returns:
+            Union[MultiModalConversationResponse,
+                  Generator[MultiModalConversationResponse, None, None]]:
+            If stream is True, return a Generator, otherwise a
+            MultiModalConversationResponse.
+        """
+        if messages is None or not messages:
+            raise InputRequired('messages is required!')
+        if model is None or not model:
+            raise ModelRequired('Model is required!')
+        task_group, _ = _get_task_group_and_task(__name__)
+        msg_copy = copy.deepcopy(messages)
+        has_upload = cls._preprocess_messages(model, msg_copy, api_key)
+        if has_upload:
+            headers = kwargs.pop('headers', {})
+            headers['X-DashScope-OssResourceResolve'] = 'enable'
+            kwargs['headers'] = headers
+        input = {'messages': msg_copy}
+        response = await super().call(
+            model=model,
+            task_group=task_group,
+            task=AioMultiModalConversation.task,
+            function=AioMultiModalConversation.function,
+            api_key=api_key,
+            input=input,
+            workspace=workspace,
+            **kwargs)
+        is_stream = kwargs.get('stream', False)
+        if is_stream:
+            return (MultiModalConversationResponse.from_api_response(rsp)
+                    async for rsp in response)
+        else:
+            return MultiModalConversationResponse.from_api_response(response)
+
+    @classmethod
+    def _preprocess_messages(cls, model: str, messages: List[dict],
+                             api_key: str):
+        """Upload any local media referenced in the message elements.
+
+        messages example:
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"image": ""},
+                        {"text": ""},
+                    ]
+                }
+            ]
+        """
+        has_upload = False
+        for message in messages:
+            content = message['content']
+            for elem in content:
+                if not isinstance(elem,
+                                  (int, float, bool, str, bytes, bytearray)):
+                    is_upload = preprocess_message_element(
+                        model, elem, api_key)
+                    if is_upload and not has_upload:
+                        has_upload = True
+        return has_upload
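
Note that when stream=True the coroutine resolves to an async generator rather than a single response, so callers consume it with async for. A sketch of that consumption pattern (assumptions as in the earlier sketch: installed package, DASHSCOPE_API_KEY set, model name from the test file):

    import asyncio
    import os

    from dashscope.aigc.multimodal_conversation import AioMultiModalConversation

    async def stream_demo():
        messages = [{'role': 'user', 'content': [{'text': 'Describe the weather.'}]}]
        # With stream=True, the awaited call returns an async generator
        # of partial MultiModalConversationResponse chunks.
        responses = await AioMultiModalConversation.call(
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            model='qwen-vl-max-latest',
            messages=messages,
            stream=True,
        )
        async for chunk in responses:
            print(chunk.output.choices[0].message.content)

    asyncio.run(stream_demo())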

dashscope/utils/oss_utils.py

Lines changed: 1 addition & 1 deletion
@@ -188,6 +188,6 @@ def check_and_upload(model, elem: dict, api_key):
     return has_upload


-def preprocess_message_element(model: str, elem: List[dict], api_key: str):
+def preprocess_message_element(model: str, elem: dict, api_key: str):
    is_upload = check_and_upload(model, elem, api_key)
    return is_upload
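
The annotation fix matches how both the sync class and the new async class actually invoke this helper: with one content-element dict at a time, never a list. A sketch of the calling pattern (illustrative only; check_and_upload decides whether the element references a local file that needs uploading):

    # Each element is a single dict such as {'image': 'file:///tmp/cat.png'}
    # or {'text': 'hello'}; the helper is applied element by element.
    for message in messages:
        for elem in message['content']:
            if isinstance(elem, dict):
                preprocess_message_element(model, elem, api_key)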
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
+import os
+import asyncio
+import dashscope
+from dashscope.aigc.multimodal_conversation import AioMultiModalConversation
+
+
+async def test_aio_multimodal_conversation():
+    """Test async multimodal conversation API."""
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"},
+                {"text": "What scene does the picture depict?"}
+            ]
+        }
+    ]
+
+    # Call asynchronously
+    response = await AioMultiModalConversation.call(
+        api_key=os.getenv('DASHSCOPE_API_KEY'),
+        model='qwen-vl-max-latest',
+        messages=messages,
+        enable_encryption=True,
+    )
+
+    print("Response:", response.output.choices[0].message.content[0]["text"])
+
+
+async def test_aio_multimodal_conversation_stream():
+    """Test async multimodal conversation API with streaming."""
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"},
+                {"text": "Please describe the content of this image in detail"}
+            ]
+        }
+    ]
+
+    # Call asynchronously with streaming
+    async for chunk in await AioMultiModalConversation.call(
+        api_key=os.getenv('DASHSCOPE_API_KEY'),
+        model='qwen-vl-max-latest',
+        messages=messages,
+        stream=True,
+        incremental_output=True,
+        enable_encryption=True,
+    ):
+        if hasattr(chunk, 'output') and chunk.output and chunk.output.choices:
+            content = chunk.output.choices[0].message.content
+            if content and len(content) > 0 and "text" in content[0]:
+                print(content[0]["text"], end="", flush=True)
+    print()  # newline
+
+
+async def test_aio_multimodal_conversation_local_image():
+    """Test async multimodal conversation API with a local image."""
+    # Use a local image file
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "tests/data/bird.JPEG"},  # an image from the test data
+                {"text": "What is this picture?"}
+            ]
+        }
+    ]
+
+    try:
+        response = await AioMultiModalConversation.call(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            model='qwen-vl-max-latest',
+            messages=messages,
+            enable_encryption=True,
+        )
+        print("Local image response:", response.output.choices[0].message.content[0]["text"])
+    except Exception as e:
+        print(f"Error with local image: {e}")
+
+
+async def test_aio_multimodal_conversation_multiple_local_images():
+    """Test async multimodal conversation API with multiple local images."""
+    # Use multiple local image files
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"image": "tests/data/bird.JPEG"},
+                {"image": "tests/data/dogs.jpg"},
+                {"text": "Please compare the differences between these two images"}
+            ]
+        }
+    ]
+
+    try:
+        print("Starting multiple local images test...")
+        response = await AioMultiModalConversation.call(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            model='qwen-vl-max-latest',
+            messages=messages,
+            enable_encryption=True,
+        )
+        print("Multiple local images response:", response.output.choices[0].message.content[0]["text"])
+    except Exception as e:
+        print(f"Error with multiple local images: {e}")
+
+
+async def main():
+    """Main function to run all tests."""
+    print("Testing Async MultiModal Conversation API...")
+    print("=" * 50)
+
+    # Test the basic async call
+    print("\n1. Testing basic async call:")
+    await test_aio_multimodal_conversation()
+
+    # Test the async streaming call
+    print("\n2. Testing async streaming call:")
+    await test_aio_multimodal_conversation_stream()
+
+    # Test with a local image
+    print("\n3. Testing with local image:")
+    await test_aio_multimodal_conversation_local_image()
+
+    # Test with multiple local images
+    print("\n4. Testing with multiple local images:")
+    await test_aio_multimodal_conversation_multiple_local_images()
+
+    print("\nAll tests completed!")
+
+
+if __name__ == "__main__":
+    # Run the async tests
+    asyncio.run(main())
