
Commit 1064d38

committed
✨ image to text tool
1 parent c031632 commit 1064d38

File tree: 7 files changed, +159 −20 lines


backend/agents/create_agent_info.py

Lines changed: 16 additions & 0 deletions

@@ -25,6 +25,10 @@
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from consts.const import LOCAL_MCP_SERVER, MODEL_CONFIG_MAPPING, LANGUAGE
 
+from backend.database.client import minio_client
+from backend.services.image_service import get_vlm_model
+from backend.utils.prompt_template_utils import get_analyze_file_prompt_template
+
 logger = logging.getLogger("create_agent_info")
 logger.setLevel(logging.DEBUG)
 
@@ -236,6 +240,18 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "vdb_core": get_vector_db_core(),
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
+        elif tool_config.class_name == "ImageUnderstandingTool":
+            # Load prompts from yaml file
+            language = 'zh'
+            prompts = get_analyze_file_prompt_template(language)
+            system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
+                                              undefined=StrictUndefined)
+            tool_config.metadata = {
+                "vlm_model": get_vlm_model(tenant_id=tenant_id),
+                "storage_client": minio_client,
+                "system_prompt_template": system_prompt_template,
+            }
+
         tool_config_list.append(tool_config)
 
     return tool_config_list
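One point worth noting in the hunk above: rendering the system prompt with StrictUndefined makes any variable that the YAML template expects but the caller does not supply fail loudly at render time instead of silently disappearing. A minimal sketch of that behaviour, using a made-up prompt string in place of the real get_analyze_file_prompt_template output:

from jinja2 import StrictUndefined, Template, UndefinedError

# Hypothetical stand-in for prompts['image_analysis']['system_prompt'].
prompt_text = "Extract the text in the image and answer: {{ query }}"

system_prompt_template = Template(prompt_text, undefined=StrictUndefined)
print(system_prompt_template.render({"query": "What does the sign say?"}))

try:
    # A template that references an unsupplied variable raises instead of
    # rendering an empty string, which is the point of StrictUndefined.
    Template("{{ missing_var }}", undefined=StrictUndefined).render({})
except UndefinedError as err:
    print(f"caught: {err}")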

backend/mcp_service.py

Lines changed: 5 additions & 0 deletions

@@ -1,4 +1,7 @@
 import logging
+
+from tool_collection.mcp.blur_image_tool import local_blur_image
+from tool_collection.mcp.get_image_by_s3_url_tool import local_get_image_by_s3_url_tool
 from utils.logging_utils import configure_logging
 from fastmcp import FastMCP
 from tool_collection.mcp.local_mcp_service import local_mcp_service
@@ -17,6 +20,8 @@
 
 # mount local service (stable, not affected by remote proxy)
 nexent_mcp.mount(local_mcp_service.name, local_mcp_service)
+nexent_mcp.mount(local_blur_image.name, local_blur_image)
+nexent_mcp.mount(local_get_image_by_s3_url_tool.name, local_get_image_by_s3_url_tool)
 
 if __name__ == "__main__":
     nexent_mcp.run(transport="sse", host="0.0.0.0", port=5011)

backend/services/image_service.py

Lines changed: 21 additions & 0 deletions

@@ -4,6 +4,11 @@
 import aiohttp
 
 from consts.const import DATA_PROCESS_SERVICE
+from nexent import MessageObserver
+from nexent.core.models import OpenAIVLModel
+
+from backend.consts.const import MODEL_CONFIG_MAPPING
+from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
 
 logger = logging.getLogger("image_service")
 
@@ -23,3 +28,19 @@ async def proxy_image_impl(decoded_url: str):
 
         result = await response.json()
         return result
+
+def get_vlm_model(tenant_id: str):
+    # Get the tenant config
+    vlm_model_config = tenant_config_manager.get_model_config(
+        key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id)
+    return OpenAIVLModel(
+        observer=MessageObserver(),
+        model_id=get_model_name_from_config(
+            vlm_model_config) if vlm_model_config else "",
+        api_base=vlm_model_config.get("base_url", ""),
+        api_key=vlm_model_config.get("api_key", ""),
+        temperature=0.7,
+        top_p=0.7,
+        frequency_penalty=0.5,
+        max_tokens=512
+    )
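A small detail in the new get_vlm_model helper: the connection fields are read with .get(..., ""), so a tenant without a usable VLM entry still yields a constructible model object with empty credentials rather than a KeyError. A minimal sketch of that lookup pattern, assuming the config manager hands back an empty dict when nothing is configured (the real return value comes from tenant_config_manager.get_model_config):

def resolve_vlm_settings(vlm_model_config: dict) -> dict:
    # Same defensive lookups used in get_vlm_model: missing keys collapse to "".
    return {
        "api_base": vlm_model_config.get("base_url", ""),
        "api_key": vlm_model_config.get("api_key", ""),
    }

print(resolve_vlm_settings({"base_url": "https://vlm.example/v1", "api_key": "sk-demo"}))
print(resolve_vlm_settings({}))  # no VLM configured for this tenant -> empty strings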

backend/services/tool_configuration_service.py

Lines changed: 3 additions & 18 deletions

@@ -27,12 +27,9 @@
 from services.vectordatabase_service import get_embedding_model, get_vector_db_core
 from services.tenant_config_service import get_selected_knowledge_list
 
-from backend.consts.const import MODEL_CONFIG_MAPPING
 from backend.database.client import minio_client, MinioClient
-from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
+from backend.services.image_service import get_vlm_model
 from backend.utils.prompt_template_utils import get_analyze_file_prompt_template
-from sdk.nexent import MessageObserver
-from sdk.nexent.core.models import OpenAIVLModel
 
 logger = logging.getLogger("tool_configuration_service")
 
@@ -621,22 +618,10 @@ def _validate_local_tool(
            'embedding_model': embedding_model,
        }
        tool_instance = tool_class(**params)
-    elif tool_name == "image_text_understanding_tool":
+    elif tool_name == "image_text_understanding":
        if not tenant_id or not user_id:
            raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
-        vlm_model_config = tenant_config_manager.get_model_config(
-            key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id)
-        image_to_text_model = OpenAIVLModel(
-            observer=MessageObserver(),
-            model_id=get_model_name_from_config(
-                vlm_model_config) if vlm_model_config else "",
-            api_base=vlm_model_config.get("base_url", ""),
-            api_key=vlm_model_config.get("api_key", ""),
-            temperature=0.7,
-            top_p=0.7,
-            frequency_penalty=0.5,
-            max_tokens=512
-        )
+        image_to_text_model = get_vlm_model(tenant_id=tenant_id)
         # Load prompts from yaml file
         language = 'zh'
         prompts = get_analyze_file_prompt_template(language)

sdk/nexent/core/agents/nexent_agent.py

Lines changed: 6 additions & 0 deletions

@@ -71,6 +71,12 @@ def create_local_tool(self, tool_config: ToolConfig):
                                    vdb_core=tool_config.metadata.get("vdb_core", []),
                                    embedding_model=tool_config.metadata.get("embedding_model", []),
                                    **params)
+        elif class_name == "ImageUnderstandingTool":
+            tools_obj = tool_class(observer=self.observer,
+                                   vlm_model=tool_config.metadata.get("vlm_model", []),
+                                   storage_client=tool_config.metadata.get("storage_client", []),
+                                   system_prompt_template=tool_config.metadata.get("system_prompt_template", []),
+                                   **params)
         else:
             tools_obj = tool_class(**params)
         if hasattr(tools_obj, 'observer'):

sdk/nexent/core/tools/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -12,7 +12,7 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
-from .image_text_understanding_tool import ImageTextUnderstandingTool
+from .image_understanding_tool import ImageUnderstandingTool
 
 __all__ = [
     "ExaSearchTool",
@@ -29,5 +29,5 @@
     "MoveItemTool",
     "ListDirectoryTool",
     "TerminalTool",
-    "ImageTextUnderstandingTool"
+    "ImageUnderstandingTool"
 ]
sdk/nexent/core/tools/image_understanding_tool.py

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+import json
+import logging
+from io import BytesIO
+
+from jinja2 import Template
+from pydantic import Field
+from smolagents.tools import Tool
+
+from ..models.openai_vlm import OpenAIVLModel
+from ..utils.observer import MessageObserver, ProcessType
+from ..utils.tools_common_message import ToolCategory, ToolSign
+from ... import MinIOStorageClient
+from ...multi_modal.load_save_object import LoadSaveObjectManager
+
+logger = logging.getLogger("image_understanding_tool")
+
+
+class ImageUnderstandingTool(Tool):
+    """Tool for extracting text from images stored in S3-compatible storage."""
+
+    name = "image_understanding"
+    description = (
+        "Understand an image stored in S3-compatible storage or HTTP and return the text content inside the image. "
+        "Provide the object location via an s3:// URL or http:// URL or https:// URL."
+    )
+    inputs = {
+        "image_url": {
+            "type": "string",
+            "description": "URL of the image to analyze (e.g., 's3://bucket/path/to/image.png', "
+                           "'http://image.png', 'https://image.png')."
+        },
+        "query": {
+            "type": "string",
+            "description": "The user query to perform."
+        }
+    }
+    output_type = "string"
+    # todo
+    category = ToolCategory.FILE.value
+    tool_sign = ToolSign.FILE_OPERATION.value
+
+    def __init__(
+        self,
+        observer: MessageObserver = Field(description="Message observer", default=None, exclude=True),
+        vlm_model: OpenAIVLModel = Field(description="The VLM model to use", default=None, exclude=True),
+        storage_client: MinIOStorageClient = Field(description="Storage client to use", default=None, exclude=True),
+        # TODO: confirm this is the right way to declare this parameter
+        system_prompt_template: Template = Field(description="System prompt template to use", default=None, exclude=True),
+    ):
+        super().__init__()
+        self.observer = observer
+        self.vlm_model = vlm_model
+        # Use provided storage_client or create a default one
+        # if storage_client is None:
+        #     storage_client = create_storage_client_from_config()
+        self.storage_client = storage_client
+        self.system_prompt_template = system_prompt_template
+
+        # Create LoadSaveObjectManager with the storage client
+        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
+
+        # Dynamically apply the load_object decorator to the forward method
+        self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
+
+        self.running_prompt_zh = "正在分析图片文字..."
+        self.running_prompt_en = "Analyzing image text..."
+
+    def _forward_impl(self, image_url: bytes, query: str) -> str:
+        """
+        Analyze the image specified by the S3 URL and return the recognized text.
+
+        Note: This method is wrapped by the load_object decorator, which downloads
+        the image from the S3 URL and passes bytes to this method.
+
+        Args:
+            image_url: Image bytes (converted from the S3 URL by the decorator).
+            query: The user query to answer about the image.
+
+        Returns:
+            The text recognized in the image.
+
+        Raises:
+            Exception: If the image cannot be downloaded or analyzed.
+        """
+        # Note: image_url is now bytes after decorator processing
+        image_stream = BytesIO(image_url)
+
+        # Send tool run message
+        if self.observer:
+            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
+            self.observer.add_message("", ProcessType.TOOL, running_prompt)
+            card_content = [{"icon": "image", "text": "Processing image..."}]
+            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
+
+        # # Load messages based on language
+        # messages = get_file_processing_messages_template(language)
+
+        try:
+            text = self.vlm_model.analyze_image(
+                image_input=image_stream,
+                system_prompt=self.system_prompt_template.render({'query': query})).content
+            return text
+            # return messages["IMAGE_CONTENT_SUCCESS"].format(filename=filename, content=text)
+        except Exception as e:
+            raise e
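The constructor wiring in this new file is easy to misread: forward is not defined on the class, it is built at init time by wrapping _forward_impl with LoadSaveObjectManager.load_object, so callers pass a URL string while the implementation receives raw bytes. A rough, self-contained sketch of that decorator pattern, using a hypothetical in-memory loader in place of the real LoadSaveObjectManager and MinIO client:

from io import BytesIO


class FakeLoadSaveObjectManager:
    """Hypothetical stand-in for LoadSaveObjectManager: resolves URL arguments to bytes."""

    def __init__(self, blobs: dict):
        self.blobs = blobs  # maps URL -> object bytes

    def load_object(self, input_names):
        def decorator(func):
            def wrapper(**kwargs):
                for name in input_names:
                    kwargs[name] = self.blobs[kwargs[name]]  # swap the URL for its bytes
                return func(**kwargs)
            return wrapper
        return decorator


def _forward_impl(image_url: bytes, query: str) -> str:
    # By the time this runs, image_url is already bytes, mirroring the tool above.
    stream = BytesIO(image_url)
    return f"read {len(stream.getvalue())} bytes for query: {query!r}"


mm = FakeLoadSaveObjectManager({"s3://bucket/sign.png": b"\x89PNG fake image bytes"})
forward = mm.load_object(input_names=["image_url"])(_forward_impl)

print(forward(image_url="s3://bucket/sign.png", query="What does the sign say?"))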
