ModelEngine-Group
diff --git a/‎backend/agents/create_agent_info.py‎
Lines changed: 8 additions & 0 deletions b/‎backend/agents/create_agent_info.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎backend/services/image_service.py‎
Lines changed: 21 additions & 0 deletions b/‎backend/services/image_service.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎backend/services/tool_configuration_service.py‎
Lines changed: 12 additions & 0 deletions b/‎backend/services/tool_configuration_service.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎sdk/nexent/core/agents/nexent_agent.py‎
Lines changed: 5 additions & 0 deletions b/‎sdk/nexent/core/agents/nexent_agent.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎sdk/nexent/core/prompts/analyze_image.yaml‎
Lines changed: 14 additions & 0 deletions b/‎sdk/nexent/core/prompts/analyze_image.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎sdk/nexent/core/prompts/analyze_image_en.yaml‎
Lines changed: 13 additions & 0 deletions b/‎sdk/nexent/core/prompts/analyze_image_en.yaml‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎sdk/nexent/core/tools/__init__.py‎
Lines changed: 3 additions & 1 deletion b/‎sdk/nexent/core/tools/__init__.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sdk/nexent/core/tools/analyze_image_tool.py‎
Lines changed: 138 additions & 0 deletions b/‎sdk/nexent/core/tools/analyze_image_tool.py‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎sdk/nexent/core/utils/prompt_template_utils.py‎
Lines changed: 53 additions & 0 deletions b/‎sdk/nexent/core/utils/prompt_template_utils.py‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎sdk/nexent/core/utils/tools_common_message.py‎
Lines changed: 3 additions & 0 deletions b/‎sdk/nexent/core/utils/tools_common_message.py‎
Lines changed: 3 additions & 0 deletions
@@ -17,9 +17,11 @@
 from services.tenant_config_service import get_selected_knowledge_list
 from services.remote_mcp_service import get_remote_mcp_server_list
 from services.memory_config_service import build_memory_context
+from services.image_service import get_vlm_model
 from database.agent_db import search_agent_info_by_agent_id, query_sub_agents_id_list
 from database.tool_db import search_tools_for_sub_agent
 from database.model_management_db import get_model_records, get_model_by_model_id
+from database.client import minio_client
 from utils.model_name_utils import add_repo_to_name
 from utils.prompt_template_utils import get_agent_prompt_template
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
@@ -236,6 +238,12 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "vdb_core": get_vector_db_core(),
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
+        elif tool_config.class_name == "AnalyzeImageTool":
+            tool_config.metadata = {
+                "vlm_model": get_vlm_model(tenant_id=tenant_id),
+                "storage_client": minio_client,
+            }
+
         tool_config_list.append(tool_config)
 
     return tool_config_list
 
@@ -4,6 +4,11 @@
 import aiohttp
 
 from consts.const import DATA_PROCESS_SERVICE
+from consts.const import MODEL_CONFIG_MAPPING
+from utils.config_utils import tenant_config_manager, get_model_name_from_config
+
+from nexent import MessageObserver
+from nexent.core.models import OpenAIVLModel
 
 logger = logging.getLogger("image_service")
 
@@ -23,3 +28,19 @@ async def proxy_image_impl(decoded_url: str):
 
             result = await response.json()
             return result
+
+def get_vlm_model(tenant_id: str):
+    """
+    Build the vision-language model client configured for a tenant.
+
+    Args:
+        tenant_id: Tenant whose VLM configuration should be loaded.
+
+    Returns:
+        OpenAIVLModel: Client built from the tenant's VLM settings. When the
+        tenant has no VLM configuration, the model id, base URL and API key
+        are all empty strings.
+    """
+    # Normalise a missing configuration to an empty dict so the `.get`
+    # lookups below cannot fail on None (model_id was already guarded,
+    # api_base/api_key were not).
+    vlm_model_config = tenant_config_manager.get_model_config(
+        key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id) or {}
+    model_id = get_model_name_from_config(
+        vlm_model_config) if vlm_model_config else ""
+    return OpenAIVLModel(
+        observer=MessageObserver(),
+        model_id=model_id,
+        api_base=vlm_model_config.get("base_url", ""),
+        api_key=vlm_model_config.get("api_key", ""),
+        temperature=0.7,
+        top_p=0.7,
+        frequency_penalty=0.5,
+        max_tokens=512
+    )
@@ -25,6 +25,8 @@
 from database.user_tenant_db import get_all_tenant_ids
 from services.vectordatabase_service import get_embedding_model, get_vector_db_core
 from services.tenant_config_service import get_selected_knowledge_list
+from database.client import minio_client
+from services.image_service import get_vlm_model
 
 logger = logging.getLogger("tool_configuration_service")
 
@@ -613,6 +615,16 @@ def _validate_local_tool(
                 'embedding_model': embedding_model,
             }
             tool_instance = tool_class(**params)
+        elif tool_name == "analyze_image":
+            if not tenant_id or not user_id:
+                raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
+            image_to_text_model = get_vlm_model(tenant_id=tenant_id)
+            params = {
+                **instantiation_params,
+                'vlm_model': image_to_text_model,
+                'storage_client': minio_client
+            }
+            tool_instance = tool_class(**params)
         else:
             tool_instance = tool_class(**instantiation_params)
 
 
@@ -83,6 +83,11 @@ def create_local_tool(self, tool_config: ToolConfig):
                     "vdb_core", None) if tool_config.metadata else None
                 tools_obj.embedding_model = tool_config.metadata.get(
                     "embedding_model", None) if tool_config.metadata else None
+            elif class_name == "AnalyzeImageTool":
+                tools_obj = tool_class(observer=self.observer,
+                                       vlm_model=tool_config.metadata.get("vlm_model", []),
+                                       storage_client=tool_config.metadata.get("storage_client", []),
+                                       **params)
             else:
                 tools_obj = tool_class(**params)
                 if hasattr(tools_obj, 'observer'):
 
@@ -0,0 +1,14 @@
+# 图片分析 Prompt 模板
+# 用于图片分析
+
+system_prompt: |-
+  用户提出了一个问题：{{ query }}，请从回答这个问题的角度精简、仔细描述一下这个图片，200字以内。
+  
+  **图片分析要求：**
+  1. 重点关注与用户问题相关的图片内容
+  2. 描述要精简明了，突出关键信息
+  3. 避免无关细节，专注于能帮助回答问题的内容
+  4. 保持客观描述，不要过度解读
+
+user_prompt: |
+  请仔细观察这张图片，并从回答用户问题的角度进行描述。
@@ -0,0 +1,13 @@
+# Image Understanding Prompt Templates
+
+system_prompt: |-
+  The user has asked a question: {{ query }}. Please provide a concise and careful description of this image from the perspective of answering this question, within 200 words.
+  
+  **Image Analysis Requirements:**
+  1. Focus on image content relevant to the user's question
+  2. Keep descriptions concise and clear, highlighting key information
+  3. Avoid irrelevant details, focus on content that helps answer the question
+  4. Maintain objective description, avoid over-interpretation
+
+user_prompt: |
+  Please carefully observe this image and describe it from the perspective of answering the user's question.
@@ -12,6 +12,7 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
+from .analyze_image_tool import AnalyzeImageTool
 
 __all__ = [
     "ExaSearchTool", 
@@ -27,5 +28,6 @@
     "DeleteDirectoryTool",
     "MoveItemTool",
     "ListDirectoryTool",
-    "TerminalTool"
+    "TerminalTool",
+    "AnalyzeImageTool"
 ]
@@ -0,0 +1,138 @@
+"""
+Analyze Image Tool
+
+Analyze images using a large language model.
+Supports images from S3, HTTP, and HTTPS URLs.
+"""
+
+import json
+import logging
+from io import BytesIO
+from typing import List
+
+from jinja2 import Template, StrictUndefined
+from pydantic import Field
+from smolagents.tools import Tool
+
+from nexent.core.models import OpenAIVLModel
+from nexent.core.utils.observer import MessageObserver, ProcessType
+from nexent.core.utils.prompt_template_utils import get_prompt_template
+from nexent.core.utils.tools_common_message import ToolCategory, ToolSign
+from nexent.storage import MinIOStorageClient
+from nexent.multi_modal.load_save_object import LoadSaveObjectManager
+
+logger = logging.getLogger("analyze_image_tool")
+
+
+class AnalyzeImageTool(Tool):
+    """
+    Tool for understanding and analyzing images using a visual language model.
+
+    The public ``forward`` entry point is created in ``__init__`` by wrapping
+    ``_forward_impl`` with ``LoadSaveObjectManager.load_object`` for the
+    ``image_urls_list`` input.
+    """
+
+    name = "analyze_image"
+    description = (
+        "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n"
+        "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
+        "HTTP, and HTTPS URLs.\n"
+        "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
+    )
+    inputs = {
+        "image_urls_list": {
+            "type": "array",
+            "description": "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs.",
+        },
+        "query": {
+            "type": "string",
+            "description": "User's question to guide the analysis"
+        }
+    }
+    output_type = "array"
+    category = ToolCategory.MULTIMODAL.value
+    tool_sign = ToolSign.MULTIMODAL_OPERATION.value
+
+    def __init__(
+            self,
+            observer: MessageObserver = Field(
+                description="Message observer",
+                default=None,
+                exclude=True),
+            vlm_model: OpenAIVLModel = Field(
+                description="The VLM model to use",
+                default=None,
+                exclude=True),
+            storage_client: MinIOStorageClient = Field(
+                description="Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs.",
+                default=None,
+                exclude=True)
+    ):
+        """
+        Create the tool and bind its ``forward`` method.
+
+        Args:
+            observer: Message observer used to report progress; may be None.
+            vlm_model: Visual language model whose ``analyze_image`` is called
+                once per image.
+            storage_client: Storage client handed to LoadSaveObjectManager.
+        """
+        super().__init__()
+        self.observer = observer
+        self.vlm_model = vlm_model
+        self.storage_client = storage_client
+        # Create LoadSaveObjectManager with the storage client
+        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
+
+        # Dynamically apply the load_object decorator to forward method, so
+        # the decorator is bound to this instance's storage client.
+        self.forward = self.mm.load_object(input_names=["image_urls_list"])(self._forward_impl)
+
+        self.running_prompt_zh = "正在分析图片..."
+        self.running_prompt_en = "Analyzing image..."
+
+    def _forward_impl(self, image_urls_list: List[bytes], query: str) -> List[str]:
+        """
+        Analyze images identified by S3 URL, HTTP URL, or HTTPS URL and return the identified text.
+
+        Note: This method is wrapped by the load_object decorator, which is
+        expected to download each image from its S3/HTTP/HTTPS URL and pass
+        the raw bytes to this method.
+
+        Args:
+            image_urls_list: List of image bytes converted from URLs by the decorator.
+            query: User's question to guide the analysis
+
+        Returns:
+            List[str]: One analysis string per image that aligns with the order
+            of the provided images.
+
+        Raises:
+            ValueError: If image_urls_list is None, not a list, or empty.
+            Exception: If an image cannot be analyzed; the underlying error is
+                attached as the exception's cause.
+        """
+        # Send tool run message
+        if self.observer:
+            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
+            self.observer.add_message("", ProcessType.TOOL, running_prompt)
+            card_content = [{"icon": "image", "text": "Analyzing images..."}]
+            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
+
+        if image_urls_list is None:
+            raise ValueError("image_urls cannot be None")
+
+        if not isinstance(image_urls_list, list):
+            raise ValueError("image_urls must be a list of bytes")
+
+        if not image_urls_list:
+            raise ValueError("image_urls must contain at least one image")
+
+        # Load prompts from yaml file; without an observer the English
+        # template is used.
+        language = self.observer.lang if self.observer else "en"
+        prompts = get_prompt_template(template_type='analyze_image', language=language)
+        # StrictUndefined makes rendering fail loudly if the template
+        # references a variable that is not supplied here.
+        system_prompt = Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query})
+
+        try:
+            analysis_results: List[str] = []
+            for index, image_bytes in enumerate(image_urls_list, start=1):
+                logger.info("Extracting image #%s, query: %s", index, query)
+                image_stream = BytesIO(image_bytes)
+                try:
+                    response = self.vlm_model.analyze_image(
+                        image_input=image_stream,
+                        system_prompt=system_prompt
+                    )
+                except Exception as e:
+                    # Identify which image failed while keeping the original
+                    # error as the explicit cause.
+                    raise Exception(f"Error understanding image {index}: {str(e)}") from e
+
+                analysis_results.append(response.content)
+
+            return analysis_results
+        except Exception as e:
+            logger.error("Error analyzing image: %s", e, exc_info=True)
+            error_msg = f"Error analyzing image: {str(e)}"
+            raise Exception(error_msg) from e
@@ -0,0 +1,53 @@
+import logging
+import os
+from typing import Dict, Any
+
+import yaml
+
+
+logger = logging.getLogger("prompt_template_utils")
+
+# Supported prompt languages; the values are the language codes used as keys
+# inside template_paths below.
+LANGUAGE = {
+    "ZH": "zh",
+    "EN": "en"
+}
+
+# Define template path mapping: template type -> language code -> YAML path.
+# Paths are written with a 'core/' prefix; get_prompt_template strips that
+# prefix and joins the remainder to the parent of this file's directory.
+template_paths = {
+    'analyze_image': {
+        LANGUAGE["ZH"]: 'core/prompts/analyze_image.yaml',
+        LANGUAGE["EN"]: 'core/prompts/analyze_image_en.yaml'
+    }
+}
+
+def get_prompt_template(template_type: str, language: str = LANGUAGE["ZH"], **kwargs) -> Dict[str, Any]:
+    """
+    Load a prompt template from its YAML file.
+
+    Args:
+        template_type: Template type, supports the following values:
+            - 'analyze_image': Analyze image template
+        language: Language code ('zh' or 'en'). Any other value falls back to
+            the English template and a warning is logged.
+        **kwargs: Additional parameters; currently only logged and otherwise unused.
+
+    Returns:
+        dict: Parsed YAML content of the template.
+
+    Raises:
+        ValueError: If template_type is not registered in template_paths.
+    """
+    logger.info(
+        f"Getting prompt template for type: {template_type}, language: {language}, kwargs: {kwargs}")
+
+    if template_type not in template_paths:
+        raise ValueError(f"Unsupported template type: {template_type}")
+
+    # Get template path; an unknown language code uses the English template
+    # instead of failing with a bare KeyError.
+    paths_by_language = template_paths[template_type]
+    if language not in paths_by_language:
+        logger.warning(
+            f"Unsupported language '{language}' for template type '{template_type}', falling back to '{LANGUAGE['EN']}'")
+        language = LANGUAGE["EN"]
+    template_path = paths_by_language[language]
+
+    # Get the directory of this file and construct absolute path
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    # Go up one level from utils to core, then use the template path
+    core_dir = os.path.dirname(current_dir)
+    # Only the leading 'core/' is stripped, since core_dir already points at it.
+    core_prefix = 'core/'
+    if template_path.startswith(core_prefix):
+        template_path = template_path[len(core_prefix):]
+    absolute_template_path = os.path.join(core_dir, template_path)
+
+    # Read and return template content
+    with open(absolute_template_path, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f)
@@ -11,6 +11,7 @@ class ToolSign(Enum):
     TAVILY_SEARCH = "d"  # Tavily search tool identifier
     FILE_OPERATION = "f"      # File operation tool identifier
     TERMINAL_OPERATION = "t"  # Terminal operation tool identifier
+    MULTIMODAL_OPERATION = "m" # Multimodal operation tool identifier
 
 
 # Tool sign mapping for backward compatibility
@@ -21,6 +22,7 @@ class ToolSign(Enum):
     "exa_search": ToolSign.EXA_SEARCH.value,
     "file_operation": ToolSign.FILE_OPERATION.value,
     "terminal_operation": ToolSign.TERMINAL_OPERATION.value,
+    "multimodal_operation": ToolSign.MULTIMODAL_OPERATION.value,
 }
 
 # Reverse mapping for lookup
@@ -33,6 +35,7 @@ class ToolCategory(Enum):
     FILE = "file"
     EMAIL = "email"
     TERMINAL = "terminal"
+    MULTIMODAL = "multimodal"
 
 
 @dataclass