✨ image to text tool

Zhi-a · Zhi-a · commit be133cd9aa9c · 2025-11-24T09:57:08.000+08:00
diff --git a/backend/agents/create_agent_info.py b/backend/agents/create_agent_info.py
@@ -240,7 +240,7 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "vdb_core": get_vector_db_core(),
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
-        elif tool_config.class_name == "ImageUnderstandingTool":
+        elif tool_config.class_name == "AnalyzeImageTool":
             tool_config.metadata = {
                 "vlm_model": get_vlm_model(tenant_id=tenant_id),
                 "storage_client": minio_client,
diff --git a/backend/services/tool_configuration_service.py b/backend/services/tool_configuration_service.py
@@ -618,7 +618,7 @@ def _validate_local_tool(
                 'embedding_model': embedding_model,
             }
             tool_instance = tool_class(**params)
-        elif tool_name == "image_understanding":
+        elif tool_name == "analyze_image":
             if not tenant_id or not user_id:
                 raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
             image_to_text_model = get_vlm_model(tenant_id=tenant_id)
diff --git a/sdk/nexent/core/agents/nexent_agent.py b/sdk/nexent/core/agents/nexent_agent.py
@@ -71,7 +71,7 @@ def create_local_tool(self, tool_config: ToolConfig):
                                        vdb_core=tool_config.metadata.get("vdb_core", []),
                                        embedding_model=tool_config.metadata.get("embedding_model", []),
                                        **params)
-            elif class_name == "ImageUnderstandingTool":
+            elif class_name == "AnalyzeImageTool":
                 tools_obj = tool_class(observer=self.observer,
                                        vlm_model=tool_config.metadata.get("vlm_model", []),
                                        storage_client=tool_config.metadata.get("storage_client", []),
diff --git a/sdk/nexent/core/prompts/analyze_image.yaml b/sdk/nexent/core/prompts/analyze_image.yaml
diff --git a/sdk/nexent/core/prompts/analyze_image_en.yaml b/sdk/nexent/core/prompts/analyze_image_en.yaml
diff --git a/sdk/nexent/core/tools/__init__.py b/sdk/nexent/core/tools/__init__.py
@@ -12,7 +12,7 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
-from .image_understanding_tool import ImageUnderstandingTool
+from .analyze_image_tool import AnalyzeImageTool
 
 __all__ = [
     "ExaSearchTool", 
@@ -29,5 +29,5 @@
     "MoveItemTool",
     "ListDirectoryTool",
     "TerminalTool",
-    "ImageUnderstandingTool"
+    "AnalyzeImageTool"
 ]
diff --git a/sdk/nexent/core/tools/analyze_image_tool.py b/sdk/nexent/core/tools/analyze_image_tool.py
@@ -13,16 +13,17 @@
 from ... import MinIOStorageClient
 from ...multi_modal.load_save_object import LoadSaveObjectManager
 
-logger = logging.getLogger("image_understanding_tool")
+logger = logging.getLogger("analyze_image_tool")
 
 
-class ImageUnderstandingTool(Tool):
-    """Tool for extracting text from images stored in S3-compatible storage."""
+class AnalyzeImageTool(Tool):
+    """Tool for understanding and analyzing image"""
 
-    name = "image_understanding"
+    name = "analyze_image"
     description = (
-        "Understand an image stored in S3-compatible storage or HTTP and return the text content inside the image. "
-        "Provide the object location via an s3:// URL or http:// URL or https:// URL."
+        "This tool uses a visual language model to understand images based on your query and then returns a description of the image."
+        "It's used to understand and analyze images stored in S3 buckets, via HTTP and HTTPS."
+        "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
     )
     inputs = {
         "image_url": {
@@ -45,32 +46,29 @@ def __init__(
             observer: MessageObserver = Field(description="Message observer", default=None, exclude=True),
             vlm_model: OpenAIVLModel = Field(description="The VLM model to use", default=None, exclude=True),
             storage_client: MinIOStorageClient = Field(description="Storage client to use", default=None, exclude=True),
-            # todo 这么写对不对
-            system_prompt_template: Template = Field(description="System prompt template to use", default=None, exclude=True),
     ):
         super().__init__()
         self.observer = observer
         self.vlm_model = vlm_model
         self.storage_client = storage_client
-        self.system_prompt_template = system_prompt_template
         # Create LoadSaveObjectManager with the storage client
         self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
 
         # Dynamically apply the load_object decorator to forward method
         self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
 
-        self.running_prompt_zh = "正在理解图片..."
-        self.running_prompt_en = "Understanding image..."
+        self.running_prompt_zh = "正在分析图片..."
+        self.running_prompt_en = "Analyzing image..."
 
     def _forward_impl(self, image_url: bytes, query: str) -> str:
         """
-        Analyze the image specified by the S3 URL and return recognized text.
+        Analyze images of S3 URL, HTTP URL, or HTTPS URL and return the identified text.
         
         Note: This method is wrapped by load_object decorator which downloads
-        the image from S3 URL and passes bytes to this method.
+        the image from S3 URL, HTTP URL, or HTTPS URL and passes bytes to this method.
 
         Args:
-            image_url: Image bytes (converted from S3 URL by decorator).
+            image_url: Image bytes (converted from S3 URL, HTTP URL, or HTTPS URL by decorator).
 
         Returns:
             JSON string containing the recognized text.
@@ -85,23 +83,21 @@ def _forward_impl(self, image_url: bytes, query: str) -> str:
         if self.observer:
             running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
             self.observer.add_message("", ProcessType.TOOL, running_prompt)
-            card_content = [{"icon": "image", "text": "Processing image..."}]
+            card_content = [{"icon": "image", "text": "Analyzing image..."}]
             self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
 
         # Load prompts from yaml file
-        prompts = get_prompt_template(template_type='understand_image',language = self.observer.lang)
+        prompts = get_prompt_template(template_type='analyze_image', language=self.observer.lang)
 
         try:
 
             response = self.vlm_model.analyze_image(
                 image_input=image_stream,
-                system_prompt=Template(prompts['system_prompt'],undefined=StrictUndefined).render({'query': query}))
+                system_prompt=Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query}))
         except Exception as e:
             raise Exception(f"Error understanding image: {str(e)}")
         text = response.content
         # Record the detailed content of this search
-        search_results_data = {'text':text}
-        if self.observer:
-            search_results_data = json.dumps(search_results_data, ensure_ascii=False)
-            self.observer.add_message("", ProcessType.SEARCH_CONTENT, search_results_data)
+        # todo 返回的结构体是什么？
+        search_results_data = {'text': text}
         return json.dumps(search_results_data, ensure_ascii=False)
diff --git a/sdk/nexent/core/utils/prompt_template_utils.py b/sdk/nexent/core/utils/prompt_template_utils.py
@@ -0,0 +1,61 @@
+import logging
+import os
+from typing import Dict, Any
+
+import yaml
+
+# NOTE(review): `consts` is not under sdk/nexent in this diff - confirm it is importable from the SDK.
+from consts.const import LANGUAGE
+
+logger = logging.getLogger("prompt_template_utils")
+
+# Maps template type -> language code -> YAML path written relative to the package root.
+template_paths = {
+    'analyze_image': {
+        LANGUAGE["ZH"]: 'core/prompts/analyze_image.yaml',
+        LANGUAGE["EN"]: 'core/prompts/analyze_image_en.yaml'
+    }
+}
+
+
+def get_prompt_template(template_type: str, language: str = LANGUAGE["ZH"], **kwargs) -> Dict[str, Any]:
+    """
+    Load the YAML prompt template registered for a template type and language.
+
+    Args:
+        template_type: Key into ``template_paths``; currently only
+            'analyze_image' is registered.
+        language: Language code used as the second-level key of
+            ``template_paths`` (the ``LANGUAGE`` constants).
+        **kwargs: Accepted and logged only; not used to select or render the template.
+
+    Returns:
+        dict: Content of the template file as parsed by ``yaml.safe_load``.
+
+    Raises:
+        ValueError: If ``template_type`` or ``language`` is not registered.
+        OSError: If the template file cannot be opened.
+    """
+    logger.info(
+        "Getting prompt template for type: %s, language: %s, kwargs: %s",
+        template_type, language, kwargs)
+
+    if template_type not in template_paths:
+        raise ValueError(f"Unsupported template type: {template_type}")
+
+    language_paths = template_paths[template_type]
+    if language not in language_paths:
+        # Fail with the same exception type as an unknown template type instead of a bare KeyError.
+        raise ValueError(
+            f"Unsupported language '{language}' for template type: {template_type}")
+
+    template_path = language_paths[language]
+
+    # Registered paths are written relative to the package root ("core/..."); this module lives in
+    # core/utils, so resolve them against the parent of this file's directory.
+    core_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    absolute_template_path = os.path.join(core_dir, template_path.replace('core/', ''))
+
+    # Read and return template content
+    with open(absolute_template_path, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f)

Original file line number	Diff line number	Diff line change
`@@ -240,7 +240,7 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):`
`240`	`240`	`"vdb_core": get_vector_db_core(),`
`241`	`241`	`"embedding_model": get_embedding_model(tenant_id=tenant_id),`
`242`	`242`	`}`
`243`		`- elif tool_config.class_name == "ImageUnderstandingTool":`
	`243`	`+ elif tool_config.class_name == "AnalyzeImageTool":`
`244`	`244`	`tool_config.metadata = {`
`245`	`245`	`"vlm_model": get_vlm_model(tenant_id=tenant_id),`
`246`	`246`	`"storage_client": minio_client,`
Original file line number	Diff line number	Diff line change
`@@ -618,7 +618,7 @@ def _validate_local_tool(`
`618`	`618`	`'embedding_model': embedding_model,`
`619`	`619`	`}`
`620`	`620`	`tool_instance = tool_class(**params)`
`621`		`- elif tool_name == "image_understanding":`
	`621`	`+ elif tool_name == "analyze_image":`
`622`	`622`	`if not tenant_id or not user_id:`
`623`	`623`	`raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")`
`624`	`624`	`image_to_text_model = get_vlm_model(tenant_id=tenant_id)`