Commit 906ca05

✨ image to text tool
1 parent 1064d38 commit 906ca05

File tree

7 files changed: +48 −34 lines changed


backend/agents/create_agent_info.py

Lines changed: 0 additions & 6 deletions
@@ -241,15 +241,9 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
         elif tool_config.class_name == "ImageUnderstandingTool":
-            # Load prompts from yaml file
-            language = 'zh'
-            prompts = get_analyze_file_prompt_template(language)
-            system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
-                                              undefined=StrictUndefined)
             tool_config.metadata = {
                 "vlm_model": get_vlm_model(tenant_id=tenant_id),
                 "storage_client": minio_client,
-                "system_prompt_template": system_prompt_template,
             }
 
         tool_config_list.append(tool_config)

backend/services/tool_configuration_service.py

Lines changed: 2 additions & 9 deletions
@@ -618,21 +618,14 @@ def _validate_local_tool(
             'embedding_model': embedding_model,
         }
         tool_instance = tool_class(**params)
-    elif tool_name == "image_text_understanding":
+    elif tool_name == "image_understanding":
         if not tenant_id or not user_id:
             raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
         image_to_text_model = get_vlm_model(tenant_id=tenant_id)
-        # Load prompts from yaml file
-        language = 'zh'
-        prompts = get_analyze_file_prompt_template(language)
-        system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
-                                          undefined=StrictUndefined)
-
         params = {
             **instantiation_params,
             'vlm_model': image_to_text_model,
-            'storage_client': minio_client,
-            'system_prompt_template': system_prompt_template
+            'storage_client': minio_client
         }
         tool_instance = tool_class(**params)
     else:

sdk/nexent/core/agents/nexent_agent.py

Lines changed: 0 additions & 1 deletion
@@ -75,7 +75,6 @@ def create_local_tool(self, tool_config: ToolConfig):
             tools_obj = tool_class(observer=self.observer,
                                    vlm_model=tool_config.metadata.get("vlm_model", []),
                                    storage_client=tool_config.metadata.get("storage_client", []),
-                                   system_prompt_template=tool_config.metadata.get("system_prompt_template", []),
                                    **params)
         else:
             tools_obj = tool_class(**params)

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# 图片分析 Prompt 模板
+# 用于图片分析
+
+system_prompt: |-
+  用户提出了一个问题:{{ query }},请从回答这个问题的角度精简、仔细描述一下这个图片,200字以内。
+
+  **图片分析要求:**
+  1. 重点关注与用户问题相关的图片内容
+  2. 描述要精简明了,突出关键信息
+  3. 避免无关细节,专注于能帮助回答问题的内容
+  4. 保持客观描述,不要过度解读
+
+user_prompt: |
+  请仔细观察这张图片,并从回答用户问题的角度进行描述。

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Image Understanding Prompt Templates
+
+system_prompt: |-
+  The user has asked a question: {{ query }}. Please provide a concise and careful description of this image from the perspective of answering this question, within 200 words.
+
+  **Image Analysis Requirements:**
+  1. Focus on image content relevant to the user's question
+  2. Keep descriptions concise and clear, highlighting key information
+  3. Avoid irrelevant details, focus on content that helps answer the question
+  4. Maintain objective description, avoid over-interpretation
+
+user_prompt: |
+  Please carefully observe this image and describe it from the perspective of answering the user's question.
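
For reference, a minimal sketch of how one of these templates is rendered once loaded. The file name and the direct yaml.safe_load call are assumptions for illustration only; in the SDK the lookup goes through get_prompt_template(template_type='understand_image', language=...), while the Template/StrictUndefined render call matches the tool code below.

import yaml
from jinja2 import StrictUndefined, Template

# Hypothetical file name; the SDK resolves the template via get_prompt_template().
with open("understand_image_en.yaml", encoding="utf-8") as f:
    prompts = yaml.safe_load(f)

# StrictUndefined makes the render fail loudly if {{ query }} is not supplied.
system_prompt = Template(prompts["system_prompt"], undefined=StrictUndefined).render(
    {"query": "What brand of laptop is shown in this photo?"}
)
print(system_prompt)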

sdk/nexent/core/tools/image_understanding_tool.py

Lines changed: 17 additions & 16 deletions
@@ -2,12 +2,13 @@
 import logging
 from io import BytesIO
 
-from jinja2 import Template
+from jinja2 import Template, StrictUndefined
 from pydantic import Field
 from smolagents.tools import Tool
 
 from ..models.openai_vlm import OpenAIVLModel
 from ..utils.observer import MessageObserver, ProcessType
+from ..utils.prompt_template_utils import get_prompt_template
 from ..utils.tools_common_message import ToolCategory, ToolSign
 from ... import MinIOStorageClient
 from ...multi_modal.load_save_object import LoadSaveObjectManager
@@ -50,21 +51,16 @@ def __init__(
         super().__init__()
         self.observer = observer
         self.vlm_model = vlm_model
-        # Use provided storage_client or create a default one
-        # if storage_client is None:
-        #     storage_client = create_storage_client_from_config()
         self.storage_client = storage_client
         self.system_prompt_template = system_prompt_template
-
-
         # Create LoadSaveObjectManager with the storage client
         self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
 
         # Dynamically apply the load_object decorator to forward method
         self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
 
-        self.running_prompt_zh = "正在分析图片文字..."
-        self.running_prompt_en = "Analyzing image text..."
+        self.running_prompt_zh = "正在理解图片..."
+        self.running_prompt_en = "Understanding image..."
 
     def _forward_impl(self, image_url: bytes, query: str) -> str:
         """
@@ -92,15 +88,20 @@ def _forward_impl(self, image_url: bytes, query: str) -> str:
             card_content = [{"icon": "image", "text": "Processing image..."}]
             self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
 
-        # # Load messages based on language
-        # messages = get_file_processing_messages_template(language)
+        # Load prompts from yaml file
+        prompts = get_prompt_template(template_type='understand_image',language = self.observer.lang)
 
         try:
-            text = self.vlm_model.analyze_image(
+
+            response = self.vlm_model.analyze_image(
                 image_input=image_stream,
-                system_prompt=self.system_prompt_template.render({'query': query})).content
-            return text
-            # return messages["IMAGE_CONTENT_SUCCESS"].format(filename=filename, content=text)
+                system_prompt=Template(prompts['system_prompt'],undefined=StrictUndefined).render({'query': query}))
         except Exception as e:
-            raise e
-
+            raise Exception(f"Error understanding image: {str(e)}")
+        text = response.content
+        # Record the detailed content of this search
+        search_results_data = {'text':text}
+        if self.observer:
+            search_results_data = json.dumps(search_results_data, ensure_ascii=False)
+            self.observer.add_message("", ProcessType.SEARCH_CONTENT, search_results_data)
+        return json.dumps(search_results_data, ensure_ascii=False)
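
For orientation, a minimal sketch of consuming the tool's new return value; the call site and the image URL below are hypothetical. Per the hunk above, forward() now returns a JSON string of the form {"text": "..."} (and, when an observer is attached, the payload is serialized a second time before being returned).

import json

# `tool` is assumed to be an already-constructed ImageUnderstandingTool with no observer;
# the image URL is a placeholder, resolved by the load_object wrapper around forward().
raw = tool.forward(image_url="https://example.com/laptop.png",
                   query="What brand of laptop is shown?")
payload = json.loads(raw)      # -> {"text": "<concise description of the image>"}
print(payload["text"])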

sdk/nexent/multi_modal/load_save_object.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def _upload_bytes_to_minio(
         self,
         bytes_data: bytes,
         object_name: Optional[str] = None,
-        bucket: str = "multi-modal",
+        bucket: str = "nexent",
         content_type: str = "application/octet-stream",
     ) -> str:
         """
@@ -194,7 +194,7 @@ def save_object(
         self,
         output_names: List[str],
         output_transformers: Optional[List[Callable[[Any], bytes]]] = None,
-        bucket: str = "multi-modal",
+        bucket: str = "nexent",
     ):
         """
         Decorator factory that uploads outputs to storage after function execution.
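
A rough sketch of what the default-bucket change means for code that relies on save_object. The producer function, the output mapping, and the storage client setup are all assumptions; only the decorator-factory signature and the new "nexent" default come from the hunk above.

from nexent.multi_modal.load_save_object import LoadSaveObjectManager

storage_client = ...  # assumed: an already-configured MinIOStorageClient (constructor not shown in this commit)
mm = LoadSaveObjectManager(storage_client=storage_client)

def produce_thumbnail() -> bytes:
    """Hypothetical producer returning raw bytes for upload."""
    return b"\x89PNG\r\n\x1a\n"

# After this commit, uploads made through save_object default to the "nexent" bucket
# (previously "multi-modal"); pass bucket="..." explicitly to keep the old location.
upload_thumbnail = mm.save_object(output_names=["thumbnail_url"])(produce_thumbnail)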
