Commit 906ca05

✨ image to text tool
1 parent 1064d38 commit 906ca05

File tree

7 files changed: +48 −34 lines changed


backend/agents/create_agent_info.py

Lines changed: 0 additions & 6 deletions
@@ -241,15 +241,9 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
         elif tool_config.class_name == "ImageUnderstandingTool":
-            # Load prompts from yaml file
-            language = 'zh'
-            prompts = get_analyze_file_prompt_template(language)
-            system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
-                                              undefined=StrictUndefined)
             tool_config.metadata = {
                 "vlm_model": get_vlm_model(tenant_id=tenant_id),
                 "storage_client": minio_client,
-                "system_prompt_template": system_prompt_template,
             }
 
         tool_config_list.append(tool_config)

backend/services/tool_configuration_service.py

Lines changed: 2 additions & 9 deletions
@@ -618,21 +618,14 @@ def _validate_local_tool(
             'embedding_model': embedding_model,
         }
         tool_instance = tool_class(**params)
-    elif tool_name == "image_text_understanding":
+    elif tool_name == "image_understanding":
         if not tenant_id or not user_id:
             raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
         image_to_text_model = get_vlm_model(tenant_id=tenant_id)
-        # Load prompts from yaml file
-        language = 'zh'
-        prompts = get_analyze_file_prompt_template(language)
-        system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
-                                          undefined=StrictUndefined)
-
         params = {
             **instantiation_params,
             'vlm_model': image_to_text_model,
-            'storage_client': minio_client,
-            'system_prompt_template': system_prompt_template
+            'storage_client': minio_client
         }
         tool_instance = tool_class(**params)
     else:

sdk/nexent/core/agents/nexent_agent.py

Lines changed: 0 additions & 1 deletion
@@ -75,7 +75,6 @@ def create_local_tool(self, tool_config: ToolConfig):
             tools_obj = tool_class(observer=self.observer,
                                    vlm_model=tool_config.metadata.get("vlm_model", []),
                                    storage_client=tool_config.metadata.get("storage_client", []),
-                                   system_prompt_template=tool_config.metadata.get("system_prompt_template", []),
                                    **params)
         else:
             tools_obj = tool_class(**params)

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# 图片分析 Prompt 模板
+# 用于图片分析
+
+system_prompt: |-
+  用户提出了一个问题:{{ query }},请从回答这个问题的角度精简、仔细描述一下这个图片,200字以内。
+
+  **图片分析要求:**
+  1. 重点关注与用户问题相关的图片内容
+  2. 描述要精简明了,突出关键信息
+  3. 避免无关细节,专注于能帮助回答问题的内容
+  4. 保持客观描述,不要过度解读
+
+user_prompt: |
+  请仔细观察这张图片,并从回答用户问题的角度进行描述。

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Image Understanding Prompt Templates
+
+system_prompt: |-
+  The user has asked a question: {{ query }}. Please provide a concise and careful description of this image from the perspective of answering this question, within 200 words.
+
+  **Image Analysis Requirements:**
+  1. Focus on image content relevant to the user's question
+  2. Keep descriptions concise and clear, highlighting key information
+  3. Avoid irrelevant details, focus on content that helps answer the question
+  4. Maintain objective description, avoid over-interpretation
+
+user_prompt: |
+  Please carefully observe this image and describe it from the perspective of answering the user's question.
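
For reference, a minimal sketch of how one of these templates is rendered once loaded. The file name and the direct yaml.safe_load call are assumptions for illustration only; in the SDK the lookup goes through get_prompt_template(template_type='understand_image', language=...), while the Template/StrictUndefined render call matches the tool code below.

import yaml
from jinja2 import StrictUndefined, Template

# Hypothetical file name; the SDK resolves the template via get_prompt_template().
with open("understand_image_en.yaml", encoding="utf-8") as f:
    prompts = yaml.safe_load(f)

# StrictUndefined makes the render fail loudly if {{ query }} is not supplied.
system_prompt = Template(prompts["system_prompt"], undefined=StrictUndefined).render(
    {"query": "What brand of laptop is shown in this photo?"}
)
print(system_prompt)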

sdk/nexent/core/tools/image_understanding_tool.py

Lines changed: 17 additions & 16 deletions
@@ -2,12 +2,13 @@
 import logging
 from io import BytesIO
 
-from jinja2 import Template
+from jinja2 import Template, StrictUndefined
 from pydantic import Field
 from smolagents.tools import Tool
 
 from ..models.openai_vlm import OpenAIVLModel
 from ..utils.observer import MessageObserver, ProcessType
+from ..utils.prompt_template_utils import get_prompt_template
 from ..utils.tools_common_message import ToolCategory, ToolSign
 from ... import MinIOStorageClient
 from ...multi_modal.load_save_object import LoadSaveObjectManager
@@ -50,21 +51,16 @@ def __init__(
         super().__init__()
         self.observer = observer
         self.vlm_model = vlm_model
-        # Use provided storage_client or create a default one
-        # if storage_client is None:
-        #     storage_client = create_storage_client_from_config()
         self.storage_client = storage_client
         self.system_prompt_template = system_prompt_template
-
-
         # Create LoadSaveObjectManager with the storage client
         self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
 
         # Dynamically apply the load_object decorator to forward method
         self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
 
-        self.running_prompt_zh = "正在分析图片文字..."
-        self.running_prompt_en = "Analyzing image text..."
+        self.running_prompt_zh = "正在理解图片..."
+        self.running_prompt_en = "Understanding image..."
 
     def _forward_impl(self, image_url: bytes, query: str) -> str:
         """
@@ -92,15 +88,20 @@ def _forward_impl(self, image_url: bytes, query: str) -> str:
             card_content = [{"icon": "image", "text": "Processing image..."}]
             self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
 
-        # # Load messages based on language
-        # messages = get_file_processing_messages_template(language)
+        # Load prompts from yaml file
+        prompts = get_prompt_template(template_type='understand_image',language = self.observer.lang)
 
         try:
-            text = self.vlm_model.analyze_image(
+
+            response = self.vlm_model.analyze_image(
                 image_input=image_stream,
-                system_prompt=self.system_prompt_template.render({'query': query})).content
-            return text
-            # return messages["IMAGE_CONTENT_SUCCESS"].format(filename=filename, content=text)
+                system_prompt=Template(prompts['system_prompt'],undefined=StrictUndefined).render({'query': query}))
         except Exception as e:
-            raise e
-
+            raise Exception(f"Error understanding image: {str(e)}")
+        text = response.content
+        # Record the detailed content of this search
+        search_results_data = {'text':text}
+        if self.observer:
+            search_results_data = json.dumps(search_results_data, ensure_ascii=False)
+            self.observer.add_message("", ProcessType.SEARCH_CONTENT, search_results_data)
+        return json.dumps(search_results_data, ensure_ascii=False)
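
For orientation, a minimal sketch of consuming the tool's new return value; the call site and the image URL below are hypothetical. Per the hunk above, forward() now returns a JSON string of the form {"text": "..."} (and, when an observer is attached, the payload is serialized a second time before being returned).

import json

# `tool` is assumed to be an already-constructed ImageUnderstandingTool with no observer;
# the image URL is a placeholder, resolved by the load_object wrapper around forward().
raw = tool.forward(image_url="https://example.com/laptop.png",
                   query="What brand of laptop is shown?")
payload = json.loads(raw)      # -> {"text": "<concise description of the image>"}
print(payload["text"])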

sdk/nexent/multi_modal/load_save_object.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def _upload_bytes_to_minio(
         self,
         bytes_data: bytes,
         object_name: Optional[str] = None,
-        bucket: str = "multi-modal",
+        bucket: str = "nexent",
         content_type: str = "application/octet-stream",
     ) -> str:
         """
@@ -194,7 +194,7 @@ def save_object(
         self,
         output_names: List[str],
         output_transformers: Optional[List[Callable[[Any], bytes]]] = None,
-        bucket: str = "multi-modal",
+        bucket: str = "nexent",
     ):
         """
         Decorator factory that uploads outputs to storage after function execution.
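
A rough sketch of what the default-bucket change means for code that relies on save_object. The producer function, the output mapping, and the storage client setup are all assumptions; only the decorator-factory signature and the new "nexent" default come from the hunk above.

from nexent.multi_modal.load_save_object import LoadSaveObjectManager

storage_client = ...  # assumed: an already-configured MinIOStorageClient (constructor not shown in this commit)
mm = LoadSaveObjectManager(storage_client=storage_client)

def produce_thumbnail() -> bytes:
    """Hypothetical producer returning raw bytes for upload."""
    return b"\x89PNG\r\n\x1a\n"

# After this commit, uploads made through save_object default to the "nexent" bucket
# (previously "multi-modal"); pass bucket="..." explicitly to keep the old location.
upload_thumbnail = mm.save_object(output_names=["thumbnail_url"])(produce_thumbnail)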
