✨ image to text tool

Zhi-a · Zhi-a · commit 82b32b36ca8c · 2025-11-20T10:44:30.000+08:00
diff --git a/backend/services/tool_configuration_service.py b/backend/services/tool_configuration_service.py
@@ -6,6 +6,7 @@
 from typing import Any, List, Optional, Dict
 from urllib.parse import urljoin
 
+from jinja2 import Template, StrictUndefined
 from pydantic_core import PydanticUndefined
 from fastmcp import Client
 import jsonref
@@ -26,6 +27,13 @@
 from services.elasticsearch_service import get_embedding_model, elastic_core
 from services.tenant_config_service import get_selected_knowledge_list
 
+from backend.consts.const import MODEL_CONFIG_MAPPING
+from backend.database.client import minio_client, MinioClient
+from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
+from backend.utils.prompt_template_utils import get_analyze_file_prompt_template
+from sdk.nexent import MessageObserver
+from sdk.nexent.core.models import OpenAIVLModel
+
 logger = logging.getLogger("tool_configuration_service")
 
 
@@ -612,6 +620,35 @@ def _validate_local_tool(
                 'embedding_model': embedding_model
             }
             tool_instance = tool_class(**params)
+        elif tool_name == "image_text_understanding_tool":
+            if not tenant_id or not user_id:
+                raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
+            vlm_model_config = tenant_config_manager.get_model_config(
+                key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id)
+            image_to_text_model = OpenAIVLModel(
+                observer=MessageObserver(),
+                model_id=get_model_name_from_config(
+                    vlm_model_config) if vlm_model_config else "",
+                api_base=vlm_model_config.get("base_url", ""),
+                api_key=vlm_model_config.get("api_key", ""),
+                temperature=0.7,
+                top_p=0.7,
+                frequency_penalty=0.5,
+                max_tokens=512
+            )
+            # Load prompts from yaml file
+            language = 'zh'
+            prompts = get_analyze_file_prompt_template(language)
+            system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
+                                     undefined=StrictUndefined)
+
+            params = {
+                **instantiation_params,
+                'vlm_model': image_to_text_model,
+                'storage_client': minio_client,
+                'system_prompt_template': system_prompt_template
+            }
+            tool_instance = tool_class(**params)
         else:
             tool_instance = tool_class(**instantiation_params)
 
diff --git a/sdk/nexent/core/tools/__init__.py b/sdk/nexent/core/tools/__init__.py
@@ -12,6 +12,7 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
+from .image_text_understanding_tool import ImageTextUnderstandingTool
 
 __all__ = [
     "ExaSearchTool", 
@@ -27,5 +28,6 @@
     "DeleteDirectoryTool",
     "MoveItemTool",
     "ListDirectoryTool",
-    "TerminalTool"
+    "TerminalTool",
+    "ImageTextUnderstandingTool"
 ]