Skip to content

Commit bef9329

Browse files
authored
✨ image to text tool from ModelEngine-Group/develop_image_tool
✨ image to text tool
2 parents 1c8864e + 116db18 commit bef9329

21 files changed

+1030
-56
lines changed

backend/agents/create_agent_info.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
from services.tenant_config_service import get_selected_knowledge_list
1818
from services.remote_mcp_service import get_remote_mcp_server_list
1919
from services.memory_config_service import build_memory_context
20+
from services.image_service import get_vlm_model
2021
from database.agent_db import search_agent_info_by_agent_id, query_sub_agents_id_list
2122
from database.tool_db import search_tools_for_sub_agent
2223
from database.model_management_db import get_model_records, get_model_by_model_id
24+
from database.client import minio_client
2325
from utils.model_name_utils import add_repo_to_name
2426
from utils.prompt_template_utils import get_agent_prompt_template
2527
from utils.config_utils import tenant_config_manager, get_model_name_from_config
@@ -236,6 +238,12 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
236238
"vdb_core": get_vector_db_core(),
237239
"embedding_model": get_embedding_model(tenant_id=tenant_id),
238240
}
241+
elif tool_config.class_name == "AnalyzeImageTool":
242+
tool_config.metadata = {
243+
"vlm_model": get_vlm_model(tenant_id=tenant_id),
244+
"storage_client": minio_client,
245+
}
246+
239247
tool_config_list.append(tool_config)
240248

241249
return tool_config_list

backend/services/image_service.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
import aiohttp
55

66
from consts.const import DATA_PROCESS_SERVICE
7+
from consts.const import MODEL_CONFIG_MAPPING
8+
from utils.config_utils import tenant_config_manager, get_model_name_from_config
9+
10+
from nexent import MessageObserver
11+
from nexent.core.models import OpenAIVLModel
712

813
logger = logging.getLogger("image_service")
914

@@ -23,3 +28,19 @@ async def proxy_image_impl(decoded_url: str):
2328

2429
result = await response.json()
2530
return result
31+
32+
def get_vlm_model(tenant_id: str):
    """Build an OpenAIVLModel from the tenant's VLM model configuration.

    Args:
        tenant_id: Tenant whose VLM configuration is looked up through
            ``tenant_config_manager``.

    Returns:
        OpenAIVLModel: A model created with a fresh ``MessageObserver`` and
        fixed sampling parameters. When the tenant has no VLM configuration,
        the model id, API base and API key are all empty strings.
    """
    # Normalise a missing/empty config to a dict so the ``.get`` calls below
    # cannot raise AttributeError on ``None``; the original only guarded
    # ``model_id`` against a falsy config.
    vlm_model_config = tenant_config_manager.get_model_config(
        key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id) or {}

    model_id = get_model_name_from_config(vlm_model_config) if vlm_model_config else ""

    return OpenAIVLModel(
        observer=MessageObserver(),
        model_id=model_id,
        api_base=vlm_model_config.get("base_url", ""),
        api_key=vlm_model_config.get("api_key", ""),
        temperature=0.7,
        top_p=0.7,
        frequency_penalty=0.5,
        max_tokens=512
    )

backend/services/tool_configuration_service.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from database.user_tenant_db import get_all_tenant_ids
2626
from services.vectordatabase_service import get_embedding_model, get_vector_db_core
2727
from services.tenant_config_service import get_selected_knowledge_list
28+
from database.client import minio_client
29+
from services.image_service import get_vlm_model
2830

2931
logger = logging.getLogger("tool_configuration_service")
3032

@@ -613,6 +615,16 @@ def _validate_local_tool(
613615
'embedding_model': embedding_model,
614616
}
615617
tool_instance = tool_class(**params)
618+
elif tool_name == "analyze_image":
619+
if not tenant_id or not user_id:
620+
raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
621+
image_to_text_model = get_vlm_model(tenant_id=tenant_id)
622+
params = {
623+
**instantiation_params,
624+
'vlm_model': image_to_text_model,
625+
'storage_client': minio_client
626+
}
627+
tool_instance = tool_class(**params)
616628
else:
617629
tool_instance = tool_class(**instantiation_params)
618630

sdk/nexent/core/agents/nexent_agent.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ def create_local_tool(self, tool_config: ToolConfig):
8383
"vdb_core", None) if tool_config.metadata else None
8484
tools_obj.embedding_model = tool_config.metadata.get(
8585
"embedding_model", None) if tool_config.metadata else None
86+
elif class_name == "AnalyzeImageTool":
87+
tools_obj = tool_class(observer=self.observer,
88+
vlm_model=tool_config.metadata.get("vlm_model", []),
89+
storage_client=tool_config.metadata.get("storage_client", []),
90+
**params)
8691
else:
8792
tools_obj = tool_class(**params)
8893
if hasattr(tools_obj, 'observer'):
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# 图片分析 Prompt 模板
2+
# 用于图片分析
3+
4+
system_prompt: |-
5+
用户提出了一个问题:{{ query }},请从回答这个问题的角度精简、仔细描述一下这个图片,200字以内。
6+
7+
**图片分析要求:**
8+
1. 重点关注与用户问题相关的图片内容
9+
2. 描述要精简明了,突出关键信息
10+
3. 避免无关细节,专注于能帮助回答问题的内容
11+
4. 保持客观描述,不要过度解读
12+
13+
user_prompt: |
14+
请仔细观察这张图片,并从回答用户问题的角度进行描述。
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Image Understanding Prompt Templates
2+
3+
system_prompt: |-
4+
The user has asked a question: {{ query }}. Please provide a concise and careful description of this image from the perspective of answering this question, within 200 words.
5+
6+
**Image Analysis Requirements:**
7+
1. Focus on image content relevant to the user's question
8+
2. Keep descriptions concise and clear, highlighting key information
9+
3. Avoid irrelevant details, focus on content that helps answer the question
10+
4. Maintain objective description, avoid over-interpretation
11+
12+
user_prompt: |
13+
Please carefully observe this image and describe it from the perspective of answering the user's question.

sdk/nexent/core/tools/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .move_item_tool import MoveItemTool
1313
from .list_directory_tool import ListDirectoryTool
1414
from .terminal_tool import TerminalTool
15+
from .analyze_image_tool import AnalyzeImageTool
1516

1617
__all__ = [
1718
"ExaSearchTool",
@@ -27,5 +28,6 @@
2728
"DeleteDirectoryTool",
2829
"MoveItemTool",
2930
"ListDirectoryTool",
30-
"TerminalTool"
31+
"TerminalTool",
32+
"AnalyzeImageTool"
3133
]
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
""""
2+
Analyze Image Tool
3+
4+
Analyze images using a large language model.
5+
Supports images from S3, HTTP, and HTTPS URLs.
6+
"""
7+
8+
import json
9+
import logging
10+
from io import BytesIO
11+
from typing import List
12+
13+
from jinja2 import Template, StrictUndefined
14+
from pydantic import Field
15+
from smolagents.tools import Tool
16+
17+
from nexent.core.models import OpenAIVLModel
18+
from nexent.core.utils.observer import MessageObserver, ProcessType
19+
from nexent.core.utils.prompt_template_utils import get_prompt_template
20+
from nexent.core.utils.tools_common_message import ToolCategory, ToolSign
21+
from nexent.storage import MinIOStorageClient
22+
from nexent.multi_modal.load_save_object import LoadSaveObjectManager
23+
24+
logger = logging.getLogger("analyze_image_tool")
25+
26+
27+
class AnalyzeImageTool(Tool):
    """Tool for understanding and analyzing images using a visual language model.

    The public ``forward`` callable is assembled in ``__init__`` by wrapping
    ``_forward_impl`` with ``LoadSaveObjectManager.load_object`` for the
    ``image_urls_list`` input.
    """

    name = "analyze_image"
    description = (
        "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n"
        "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
        "HTTP, and HTTPS URLs.\n"
        "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
    )
    inputs = {
        "image_urls_list": {
            "type": "array",
            "description": "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs.",
        },
        "query": {
            "type": "string",
            "description": "User's question to guide the analysis"
        }
    }
    output_type = "array"
    category = ToolCategory.MULTIMODAL.value
    tool_sign = ToolSign.MULTIMODAL_OPERATION.value

    # NOTE(review): the parameter defaults below are pydantic ``Field(...)``
    # objects, so a caller that omits an argument receives that object rather
    # than ``None`` -- confirm every caller passes all three explicitly.
    def __init__(
        self,
        observer: MessageObserver = Field(
            description="Message observer",
            default=None,
            exclude=True),
        vlm_model: OpenAIVLModel = Field(
            description="The VLM model to use",
            default=None,
            exclude=True),
        storage_client: MinIOStorageClient = Field(
            description="Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs.",
            default=None,
            exclude=True)
    ):
        """Store the collaborators and build the decorated ``forward`` callable.

        Args:
            observer: Message observer used to report progress; may be falsy.
            vlm_model: Model whose ``analyze_image`` method is called per image.
            storage_client: Client handed to ``LoadSaveObjectManager``.
        """
        super().__init__()
        self.observer = observer
        self.vlm_model = vlm_model
        self.storage_client = storage_client
        # Create LoadSaveObjectManager with the storage client
        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)

        # Dynamically apply the load_object decorator to forward method
        self.forward = self.mm.load_object(input_names=["image_urls_list"])(self._forward_impl)

        self.running_prompt_zh = "正在分析图片..."
        self.running_prompt_en = "Analyzing image..."

    def _forward_impl(self, image_urls_list: List[bytes], query: str) -> List[str]:
        """Analyze each image with the VLM and return one description per image.

        Note: this method is wrapped by the ``load_object`` decorator in
        ``__init__``; it is expected to receive image bytes (already loaded
        from the S3/HTTP/HTTPS URLs the caller supplied) rather than URLs.

        Args:
            image_urls_list: List of image bytes, one entry per image.
            query: User's question, rendered into the system prompt to guide
                the analysis.

        Returns:
            List[str]: One analysis string per image, in the same order as the
            provided images.

        Raises:
            ValueError: If ``image_urls_list`` is ``None``, not a list, or empty.
            Exception: If any image cannot be analyzed; the underlying error is
                chained as ``__cause__``.
        """
        # Send tool run message
        if self.observer:
            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
            self.observer.add_message("", ProcessType.TOOL, running_prompt)
            card_content = [{"icon": "image", "text": "Analyzing images..."}]
            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))

        if image_urls_list is None:
            raise ValueError("image_urls cannot be None")

        if not isinstance(image_urls_list, list):
            raise ValueError("image_urls must be a list of bytes")

        if not image_urls_list:
            raise ValueError("image_urls must contain at least one image")

        # Load prompts from yaml file
        language = self.observer.lang if self.observer else "en"
        prompts = get_prompt_template(template_type='analyze_image', language=language)
        system_prompt = Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query})

        try:
            analysis_results: List[str] = []
            for index, image_bytes in enumerate(image_urls_list, start=1):
                logger.info(f"Extracting image #{index}, query: {query}")
                image_stream = BytesIO(image_bytes)
                try:
                    response = self.vlm_model.analyze_image(
                        image_input=image_stream,
                        system_prompt=system_prompt
                    )
                except Exception as e:
                    # Identify which image failed and keep the original cause chained.
                    raise Exception(f"Error understanding image {index}: {str(e)}") from e

                analysis_results.append(response.content)

            return analysis_results
        except Exception as e:
            logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
            error_msg = f"Error analyzing image: {str(e)}"
            raise Exception(error_msg) from e
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import logging
2+
import os
3+
from typing import Dict, Any
4+
5+
import yaml
6+
7+
8+
logger = logging.getLogger("prompt_template_utils")

LANGUAGE = {
    "ZH": "zh",
    "EN": "en"
}

# Define template path mapping: template type -> language code -> YAML path
template_paths = {
    'analyze_image': {
        LANGUAGE["ZH"]: 'core/prompts/analyze_image.yaml',
        LANGUAGE["EN"]: 'core/prompts/analyze_image_en.yaml'
    }
}


def get_prompt_template(template_type: str, language: str = LANGUAGE["ZH"], **kwargs) -> Dict[str, Any]:
    """
    Load a prompt template YAML file and return its parsed content.

    Args:
        template_type: Template type, supports the following values:
            - 'analyze_image': Analyze image template
        language: Language code ('zh' or 'en'). Any other value falls back to
            the English template and a warning is logged.
        **kwargs: Extra keyword arguments; they are only included in the log
            message and do not affect which template is loaded.

    Returns:
        dict: Loaded prompt template

    Raises:
        ValueError: If ``template_type`` is not a key of ``template_paths``.
    """
    logger.info(
        "Getting prompt template for type: %s, language: %s, kwargs: %s",
        template_type, language, kwargs)

    if template_type not in template_paths:
        raise ValueError(f"Unsupported template type: {template_type}")

    # Get template path; an unknown language previously surfaced as a bare
    # KeyError, so fall back to English instead of crashing.
    paths_by_language = template_paths[template_type]
    if language not in paths_by_language:
        logger.warning(
            "Unsupported language %r for template type %s, falling back to %s",
            language, template_type, LANGUAGE["EN"])
        language = LANGUAGE["EN"]
    template_path = paths_by_language[language]

    # Get the directory of this file and construct absolute path
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Go up one level from utils to core, then use the template path
    core_dir = os.path.dirname(current_dir)
    absolute_template_path = os.path.join(core_dir, template_path.replace('core/', ''))

    # Read and return template content
    with open(absolute_template_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

sdk/nexent/core/utils/tools_common_message.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class ToolSign(Enum):
1111
TAVILY_SEARCH = "d" # Tavily search tool identifier
1212
FILE_OPERATION = "f" # File operation tool identifier
1313
TERMINAL_OPERATION = "t" # Terminal operation tool identifier
14+
MULTIMODAL_OPERATION = "m" # Multimodal operation tool identifier
1415

1516

1617
# Tool sign mapping for backward compatibility
@@ -21,6 +22,7 @@ class ToolSign(Enum):
2122
"exa_search": ToolSign.EXA_SEARCH.value,
2223
"file_operation": ToolSign.FILE_OPERATION.value,
2324
"terminal_operation": ToolSign.TERMINAL_OPERATION.value,
25+
"multimodal_operation": ToolSign.MULTIMODAL_OPERATION.value,
2426
}
2527

2628
# Reverse mapping for lookup
@@ -33,6 +35,7 @@ class ToolCategory(Enum):
3335
FILE = "file"
3436
EMAIL = "email"
3537
TERMINAL = "terminal"
38+
MULTIMODAL = "multimodal"
3639

3740

3841
@dataclass

0 commit comments

Comments
 (0)