✨ image to text tool

Zhi-a · Zhi-a · commit 768472759545 · 2025-11-25T15:27:45.000+08:00
diff --git a/backend/agents/create_agent_info.py b/backend/agents/create_agent_info.py
@@ -24,7 +24,6 @@
 from utils.prompt_template_utils import get_agent_prompt_template
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from consts.const import LOCAL_MCP_SERVER, MODEL_CONFIG_MAPPING, LANGUAGE
-
 from backend.database.client import minio_client
 from backend.services.image_service import get_vlm_model
 
diff --git a/backend/mcp_service.py b/backend/mcp_service.py
@@ -1,5 +1,4 @@
 import logging
-
 from utils.logging_utils import configure_logging
 from fastmcp import FastMCP
 from tool_collection.mcp.local_mcp_service import local_mcp_service
diff --git a/backend/services/image_service.py b/backend/services/image_service.py
@@ -4,12 +4,12 @@
 import aiohttp
 
 from consts.const import DATA_PROCESS_SERVICE
+from consts.const import MODEL_CONFIG_MAPPING
+from utils.config_utils import tenant_config_manager, get_model_name_from_config
+
 from nexent import MessageObserver
 from nexent.core.models import OpenAIVLModel
 
-from backend.consts.const import MODEL_CONFIG_MAPPING
-from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
-
 logger = logging.getLogger("image_service")
 
 
diff --git a/backend/services/tool_configuration_service.py b/backend/services/tool_configuration_service.py
@@ -25,9 +25,8 @@
 from database.user_tenant_db import get_all_tenant_ids
 from services.vectordatabase_service import get_embedding_model, get_vector_db_core
 from services.tenant_config_service import get_selected_knowledge_list
-
-from backend.database.client import minio_client
-from backend.services.image_service import get_vlm_model
+from database.client import minio_client
+from services.image_service import get_vlm_model
 
 logger = logging.getLogger("tool_configuration_service")
 
diff --git a/doc/docs/zh/opensource-memorial-wall.md b/doc/docs/zh/opensource-memorial-wall.md
@@ -16,26 +16,6 @@
 请保持消息的礼貌和尊重，符合我们的行为准则。
 -->
 
-::: info happyzhang - 2025-11-13
-也许我们正见证着未来的“后起之秀”😀
-:::
-
-::: info KevinLeeNJ - 2025-11-13
-来参加华为ICT大赛的，nexent很不错，希望后续能有更多功能！
-:::
-
-::: info lzysleep - 2025-11-7
-非常不错的项目，很适合快速上手搭建自己的Agent，赞赞赞！
-:::
-
-::: info fishcat - 2025-10-31
-很好的项目，希望蒸蒸日上
-:::
-
-::: tip xiaomi250 - 2025-10-18
-打算冲一波 ICT 大赛！正好借着这个机会多捣鼓捣鼓，把我的技术再升个级，想想还挺有意思的～
-:::
-
 ::: tip aibito - 某创业公司后端开发 - 2025-05-18
 我们是一家只有 15 人的小公司，之前一直想做智能客服但技术门槛太高。发现 Nexent 后如获至宝！20+ 文件格式支持让我们轻松处理用户上传的各种文档，多模态对话功能完美解决了语音客服需求。最重要的是，我们的产品经理现在也能直接用自然语言调整智能体逻辑，开发效率提升了好几倍！
 :::
@@ -180,6 +160,10 @@ Nexent的自然语言生成Agent以及多智能体协同是我一直在研究的
 第一次接触智能体编排，是为了参加华为ICT大赛而了解 Nexent 的。  没想到入门比想象中容易，文档也写得很清晰。  
 :::
 
+::: tip xiaomi250 - 2025-10-18
+打算冲一波 ICT 大赛！正好借着这个机会多捣鼓捣鼓，把我的技术再升个级，想想还挺有意思的～
+:::
+
 ::: tip YuXiaoLoong - 2025-10-27
 Nexent是一个十分便利的开发平台，文档清晰，工具齐全，有幸能用上这么好用的平台，希望能在这个平台上学到更多技术和思想。
 :::
@@ -238,6 +222,8 @@ Nexent功能如此之强大，给我很多帮助，感谢开发者！厉害
 
 ::: info y-dq - 2025-10-28 
 想要自己尝试搭建智能体，感叹Nexent的功能如此强大！
+:::
+
 ::: tip cai7777 - 2025-10-23
 参加ICT大赛来了解 Nexent
 :::
@@ -314,6 +300,10 @@ Nexent功能如此之强大，给我很多帮助，感谢开发者！厉害
 感谢 Nexent 让我踏上了开源之旅!希望能参加ict大赛长长见识。项目不错，给个star~ 
 :::
 
+::: info fishcat - 2025-10-31
+很好的项目，希望蒸蒸日上
+:::
+
 ::: info XxHosxX - 2025-11-5
 希望参与ICT大赛以及Nexent平台提升自己的能力:)
 :::
@@ -330,6 +320,10 @@ Nexent功能如此之强大，给我很多帮助，感谢开发者！厉害
 期待能使用Nexent成为智能体开发大佬
 :::
 
+::: info lzysleep - 2025-11-7
+非常不错的项目，很适合快速上手搭建自己的Agent，赞赞赞！
+:::
+
 ::: info xiaochenIpter - 2025-11-08
 希望能参加ict大赛可以学习到更多知识,感谢 Nexent 让我踏上了开源之旅！平台开发智能体的能力十分强大，希望能够学习到更多东西！
 :::
@@ -398,6 +392,14 @@ Nexent功能如此之强大，给我很多帮助，感谢开发者！厉害
 我又来了，通过华为ICT了解到nexent，正在学习中...
 :::
 
+::: info happyzhang - 2025-11-13
+也许我们正见证着未来的“后起之秀”😀
+:::
+
+::: info KevinLeeNJ - 2025-11-13
+来参加华为ICT大赛的，nexent很不错，希望后续能有更多功能！
+:::
+
 ::: info user - 2025-11-14
 我要参加华为ICT
 :::
@@ -434,3 +436,66 @@ Nexent功能如此之强大，给我很多帮助，感谢开发者！厉害
 感谢 Nexent 让我踏上了开源之旅！给我一个机会制作智能体
 :::
 
+::: info 开源小白 - 2025-11-19
+感谢 Nexent 让我踏上了开源之旅！这个项目的文档真的很棒，帮助我快速上手。
+:::
+
+::: info chengyudan - 2025-10-20 
+感谢 Nexent 让我踏上了开源之旅！ 
+:::
+
+::: info user - 2025-11-20
+学习ai - agent非常好的项目，后面会持续输出贡献！
+:::
+
+::: info china-king-hs - 2025-11-20
+希望能正常使用nexent
+:::
+
+::: info user - 2025-11-22
+感谢nexent这个开源项目
+:::
+
+::: tip xiaofu - 2025-11-23
+xiaofu到此一游，感谢 Nexent 让我踏上了开源之旅！
+:::
+
+::: info DUTBenjamin - 2025-11-23
+来参加华为ICT大赛的,正好借着这个机会多捣鼓捣鼓,学到更多东西，加油！
+:::
+
+::: info dean-stock - 2025-11-23
+感谢nexent让我第一次接触到了智能体，让我从使用到创作智能体的转变。
+:::
+
+::: info user - 2025-11-23
+学习到ai了，很好用
+:::
+
+::: info chao - 2025-11-23
+使用 Nexent 开发了项目，MCP 工具集成特别强大，节省了大量开发时间！
+:::
+
+::: info adasibi - 2025-11-23
+学习ai很好用，感谢 Nexent 让我踏上了开源之旅！
+:::
+
+::: info user - 2025-11-23
+Nexent越来越好！
+:::
+
+::: info DUTBenjamin - 2025-11-23
+来参加华为ICT大赛的,正好借着这个机会学到更多东西，加油！
+:::
+
+::: info aurorahashcat - 2025-11-23
+nexent看起来超棒的自动化智能体构建平台，祝越来越好😀
+:::
+
+::: info williamllk from SJTU - 2025-11-23
+感谢 Nexent 让我第一次制作智能体，尝试将AI4Science的理念付诸实践
+:::
+
+::: tip lostlight530 - 2025-11-24
+通过 Nexent 实现了 Router-Worker 架构的完美落地。无论是构建高情商的拟人化伴侣，还是处理严苛的结构化数据约束，这套框架都游刃有余。多智能体编排体验极佳！
+:::
diff --git a/sdk/nexent/core/agents/nexent_agent.py b/sdk/nexent/core/agents/nexent_agent.py
@@ -83,11 +83,6 @@ def create_local_tool(self, tool_config: ToolConfig):
                     "vdb_core", None) if tool_config.metadata else None
                 tools_obj.embedding_model = tool_config.metadata.get(
                     "embedding_model", None) if tool_config.metadata else None
-                tools_obj = tool_class(index_names=tool_config.metadata.get("index_names", []),
-                                       observer=self.observer,
-                                       vdb_core=tool_config.metadata.get("vdb_core", []),
-                                       embedding_model=tool_config.metadata.get("embedding_model", []),
-                                       **params)
             elif class_name == "AnalyzeImageTool":
                 tools_obj = tool_class(observer=self.observer,
                                        vlm_model=tool_config.metadata.get("vlm_model", []),
diff --git a/sdk/nexent/core/tools/analyze_image_tool.py b/sdk/nexent/core/tools/analyze_image_tool.py
@@ -1,51 +1,68 @@
+"""
+Analyze Image Tool
+
+Analyze images using a large language model.
+Supports images from S3, HTTP, and HTTPS URLs.
+"""
+
 import json
 import logging
 from io import BytesIO
+from typing import List, Union
 
 from jinja2 import Template, StrictUndefined
 from pydantic import Field
 from smolagents.tools import Tool
 
-from ..models.openai_vlm import OpenAIVLModel
-from ..utils.observer import MessageObserver, ProcessType
-from ..utils.prompt_template_utils import get_prompt_template
-from ..utils.tools_common_message import ToolCategory, ToolSign
-from ... import MinIOStorageClient
-from ...multi_modal.load_save_object import LoadSaveObjectManager
+from nexent.core.models import OpenAIVLModel
+from nexent.core.utils.observer import MessageObserver, ProcessType
+from nexent.core.utils.prompt_template_utils import get_prompt_template
+from nexent.core.utils.tools_common_message import ToolCategory, ToolSign
+from nexent.storage import MinIOStorageClient
+from nexent.multi_modal.load_save_object import LoadSaveObjectManager
 
 logger = logging.getLogger("analyze_image_tool")
 
 
 class AnalyzeImageTool(Tool):
-    """Tool for understanding and analyzing image"""
+    """Tool for understanding and analyzing image using a visual language model"""
 
     name = "analyze_image"
     description = (
-        "This tool uses a visual language model to understand images based on your query and then returns a description of the image."
-        "It's used to understand and analyze images stored in S3 buckets, via HTTP and HTTPS."
+        "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n"
+        "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
+        "HTTP, and HTTPS URLs.\n"
         "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
     )
     inputs = {
-        "image_url": {
-            "type": "string",
-            "description": "URL of the image to analyze (e.g., 's3://bucket/path/to/image.png',"
-                           "'http://image.png', 'https://image.png')."
+        "image_urls_list": {
+            "type": "array",
+            "description": "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs. "
+                           "Can also accept a single image URL which will be treated as a list with one element.",
         },
         "query": {
             "type": "string",
-            "description": "The user query to perform."
+            "description": "User's question to guide the analysis"
         }
     }
     output_type = "string"
-    # todo
     category = ToolCategory.FILE.value
     tool_sign = ToolSign.FILE_OPERATION.value
 
     def __init__(
             self,
-            observer: MessageObserver = Field(description="Message observer", default=None, exclude=True),
-            vlm_model: OpenAIVLModel = Field(description="The VLM model to use", default=None, exclude=True),
-            storage_client: MinIOStorageClient = Field(description="Storage client to use", default=None, exclude=True),
+            observer: MessageObserver = Field(
+                description="Message observer",
+                default=None,
+                exclude=True),
+            vlm_model: OpenAIVLModel = Field(
+                description="The VLM model to use",
+                default=None,
+                exclude=True),
+            storage_client: MinIOStorageClient = Field(
+                description="Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs.",
+                default=None,
+                exclude=True)
     ):
         super().__init__()
         self.observer = observer
@@ -55,49 +72,74 @@ def __init__(
         self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
 
         # Dynamically apply the load_object decorator to forward method
-        self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
+        self.forward = self.mm.load_object(input_names=["image_urls_list"])(self._forward_impl)
 
         self.running_prompt_zh = "正在分析图片..."
         self.running_prompt_en = "Analyzing image..."
 
-    def _forward_impl(self, image_url: bytes, query: str) -> str:
+    def _forward_impl(self, image_urls_list: Union[bytes, List[bytes]], query: str) -> Union[str, List[str]]:
         """
         Analyze images of S3 URL, HTTP URL, or HTTPS URL and return the identified text.
         
         Note: This method is wrapped by load_object decorator which downloads
         the image from S3 URL, HTTP URL, or HTTPS URL and passes bytes to this method.
 
         Args:
-            image_url: Image bytes (converted from S3 URL, HTTP URL, or HTTPS URL by decorator).
+            image_urls_list: image bytes or a sequence of image bytes (converted from URLs by the decorator).
+                             The load_object decorator converts URLs to bytes before calling this method.
+            query: User's question to guide the analysis
 
         Returns:
-            JSON string containing the recognized text.
+            Union[str, List[str]]: Single analysis string for one image or a list
+            of analysis strings that align with the order of the provided images.
 
         Raises:
             Exception: If the image cannot be downloaded or analyzed.
         """
-        # Note: image_url is now bytes after decorator processing
-        image_stream = BytesIO(image_url)
+        if image_urls_list is None:
+            raise ValueError("image_urls cannot be None")
+
+        if isinstance(image_urls_list, (list, tuple)):
+            image_urls_list: List[bytes] = list(image_urls_list)
+        elif isinstance(image_urls_list, bytes):
+            image_urls_list = [image_urls_list]
+        else:
+            raise ValueError("image_urls must be bytes or a list/tuple of bytes")
+
+        if len(image_urls_list) == 0:
+            raise ValueError("image_urls must contain at least one image")
 
         # Send tool run message
         if self.observer:
             running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
             self.observer.add_message("", ProcessType.TOOL, running_prompt)
-            card_content = [{"icon": "image", "text": "Analyzing image..."}]
+            card_content = [{"icon": "image", "text": f"Analyzing images..."}]
             self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
 
         # Load prompts from yaml file
-        prompts = get_prompt_template(template_type='analyze_image', language=self.observer.lang)
+        language = self.observer.lang if self.observer else "en"
+        prompts = get_prompt_template(template_type='analyze_image', language=language)
+        system_prompt = Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query})
 
         try:
-
-            response = self.vlm_model.analyze_image(
-                image_input=image_stream,
-                system_prompt=Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query}))
+            analysis_results: List[str] = []
+            for index, image_bytes in enumerate(image_urls_list, start=1):
+                logger.info(f"Extracting image #{index}, query: {query}")
+                image_stream = BytesIO(image_bytes)
+                try:
+                    response = self.vlm_model.analyze_image(
+                        image_input=image_stream,
+                        system_prompt=system_prompt
+                    )
+                except Exception as e:
+                    raise Exception(f"Error understanding image {index}: {str(e)}")
+
+                analysis_results.append(response.content)
+
+            if len(analysis_results) == 1:
+                return analysis_results[0]
+            return analysis_results
         except Exception as e:
-            raise Exception(f"Error understanding image: {str(e)}")
-        text = response.content
-        # Record the detailed content of this search
-        # todo 返回的结构体是什么？
-        search_results_data = {'text': text}
-        return json.dumps(search_results_data, ensure_ascii=False)
+            logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
+            error_msg = f"Error analyzing image: {str(e)}"
+            raise Exception(error_msg)

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`import logging`
`2`		`-`
`3`	`2`	`from utils.logging_utils import configure_logging`
`4`	`3`	`from fastmcp import FastMCP`
`5`	`4`	`from tool_collection.mcp.local_mcp_service import local_mcp_service`