
Commit 1064d38

committed
✨ image to text tool
1 parent c031632 commit 1064d38

File tree: 7 files changed, +159 −20 lines


backend/agents/create_agent_info.py

Lines changed: 16 additions & 0 deletions

@@ -25,6 +25,10 @@
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from consts.const import LOCAL_MCP_SERVER, MODEL_CONFIG_MAPPING, LANGUAGE
 
+from backend.database.client import minio_client
+from backend.services.image_service import get_vlm_model
+from backend.utils.prompt_template_utils import get_analyze_file_prompt_template
+
 logger = logging.getLogger("create_agent_info")
 logger.setLevel(logging.DEBUG)
 
@@ -236,6 +240,18 @@ async def create_tool_config_list(agent_id, tenant_id, user_id):
                 "vdb_core": get_vector_db_core(),
                 "embedding_model": get_embedding_model(tenant_id=tenant_id),
             }
+        elif tool_config.class_name == "ImageUnderstandingTool":
+            # Load prompts from yaml file
+            language = 'zh'
+            prompts = get_analyze_file_prompt_template(language)
+            system_prompt_template = Template(prompts['image_analysis']['system_prompt'],
+                                              undefined=StrictUndefined)
+            tool_config.metadata = {
+                "vlm_model": get_vlm_model(tenant_id=tenant_id),
+                "storage_client": minio_client,
+                "system_prompt_template": system_prompt_template,
+            }
+
         tool_config_list.append(tool_config)
 
     return tool_config_list
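One point worth noting in the hunk above: rendering the system prompt with StrictUndefined makes any variable that the YAML template expects but the caller does not supply fail loudly at render time instead of silently disappearing. A minimal sketch of that behaviour, using a made-up prompt string in place of the real get_analyze_file_prompt_template output:

from jinja2 import StrictUndefined, Template, UndefinedError

# Hypothetical stand-in for prompts['image_analysis']['system_prompt'].
prompt_text = "Extract the text in the image and answer: {{ query }}"

system_prompt_template = Template(prompt_text, undefined=StrictUndefined)
print(system_prompt_template.render({"query": "What does the sign say?"}))

try:
    # A template that references an unsupplied variable raises instead of
    # rendering an empty string, which is the point of StrictUndefined.
    Template("{{ missing_var }}", undefined=StrictUndefined).render({})
except UndefinedError as err:
    print(f"caught: {err}")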

backend/mcp_service.py

Lines changed: 5 additions & 0 deletions

@@ -1,4 +1,7 @@
 import logging
+
+from tool_collection.mcp.blur_image_tool import local_blur_image
+from tool_collection.mcp.get_image_by_s3_url_tool import local_get_image_by_s3_url_tool
 from utils.logging_utils import configure_logging
 from fastmcp import FastMCP
 from tool_collection.mcp.local_mcp_service import local_mcp_service
@@ -17,6 +20,8 @@
 
 # mount local service (stable, not affected by remote proxy)
 nexent_mcp.mount(local_mcp_service.name, local_mcp_service)
+nexent_mcp.mount(local_blur_image.name, local_blur_image)
+nexent_mcp.mount(local_get_image_by_s3_url_tool.name, local_get_image_by_s3_url_tool)
 
 if __name__ == "__main__":
     nexent_mcp.run(transport="sse", host="0.0.0.0", port=5011)

backend/services/image_service.py

Lines changed: 21 additions & 0 deletions

@@ -4,6 +4,11 @@
 import aiohttp
 
 from consts.const import DATA_PROCESS_SERVICE
+from nexent import MessageObserver
+from nexent.core.models import OpenAIVLModel
+
+from backend.consts.const import MODEL_CONFIG_MAPPING
+from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
 
 logger = logging.getLogger("image_service")
 
@@ -23,3 +28,19 @@ async def proxy_image_impl(decoded_url: str):
 
         result = await response.json()
         return result
+
+def get_vlm_model(tenant_id: str):
+    # Get the tenant config
+    vlm_model_config = tenant_config_manager.get_model_config(
+        key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id)
+    return OpenAIVLModel(
+        observer=MessageObserver(),
+        model_id=get_model_name_from_config(
+            vlm_model_config) if vlm_model_config else "",
+        api_base=vlm_model_config.get("base_url", ""),
+        api_key=vlm_model_config.get("api_key", ""),
+        temperature=0.7,
+        top_p=0.7,
+        frequency_penalty=0.5,
+        max_tokens=512
+    )
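A small detail in the new get_vlm_model helper: the connection fields are read with .get(..., ""), so a tenant without a usable VLM entry still yields a constructible model object with empty credentials rather than a KeyError. A minimal sketch of that lookup pattern, assuming the config manager hands back an empty dict when nothing is configured (the real return value comes from tenant_config_manager.get_model_config):

def resolve_vlm_settings(vlm_model_config: dict) -> dict:
    # Same defensive lookups used in get_vlm_model: missing keys collapse to "".
    return {
        "api_base": vlm_model_config.get("base_url", ""),
        "api_key": vlm_model_config.get("api_key", ""),
    }

print(resolve_vlm_settings({"base_url": "https://vlm.example/v1", "api_key": "sk-demo"}))
print(resolve_vlm_settings({}))  # no VLM configured for this tenant -> empty strings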

backend/services/tool_configuration_service.py

Lines changed: 3 additions & 18 deletions

@@ -27,12 +27,9 @@
 from services.vectordatabase_service import get_embedding_model, get_vector_db_core
 from services.tenant_config_service import get_selected_knowledge_list
 
-from backend.consts.const import MODEL_CONFIG_MAPPING
 from backend.database.client import minio_client, MinioClient
-from backend.utils.config_utils import tenant_config_manager, get_model_name_from_config
+from backend.services.image_service import get_vlm_model
 from backend.utils.prompt_template_utils import get_analyze_file_prompt_template
-from sdk.nexent import MessageObserver
-from sdk.nexent.core.models import OpenAIVLModel
 
 logger = logging.getLogger("tool_configuration_service")
 
@@ -621,22 +618,10 @@ def _validate_local_tool(
            'embedding_model': embedding_model,
        }
        tool_instance = tool_class(**params)
-    elif tool_name == "image_text_understanding_tool":
+    elif tool_name == "image_text_understanding":
        if not tenant_id or not user_id:
            raise ToolExecutionException(f"Tenant ID and User ID are required for {tool_name} validation")
-        vlm_model_config = tenant_config_manager.get_model_config(
-            key=MODEL_CONFIG_MAPPING["vlm"], tenant_id=tenant_id)
-        image_to_text_model = OpenAIVLModel(
-            observer=MessageObserver(),
-            model_id=get_model_name_from_config(
-                vlm_model_config) if vlm_model_config else "",
-            api_base=vlm_model_config.get("base_url", ""),
-            api_key=vlm_model_config.get("api_key", ""),
-            temperature=0.7,
-            top_p=0.7,
-            frequency_penalty=0.5,
-            max_tokens=512
-        )
+        image_to_text_model = get_vlm_model(tenant_id=tenant_id)
         # Load prompts from yaml file
         language = 'zh'
         prompts = get_analyze_file_prompt_template(language)

sdk/nexent/core/agents/nexent_agent.py

Lines changed: 6 additions & 0 deletions

@@ -71,6 +71,12 @@ def create_local_tool(self, tool_config: ToolConfig):
                                    vdb_core=tool_config.metadata.get("vdb_core", []),
                                    embedding_model=tool_config.metadata.get("embedding_model", []),
                                    **params)
+        elif class_name == "ImageUnderstandingTool":
+            tools_obj = tool_class(observer=self.observer,
+                                   vlm_model=tool_config.metadata.get("vlm_model", []),
+                                   storage_client=tool_config.metadata.get("storage_client", []),
+                                   system_prompt_template=tool_config.metadata.get("system_prompt_template", []),
+                                   **params)
         else:
             tools_obj = tool_class(**params)
         if hasattr(tools_obj, 'observer'):

sdk/nexent/core/tools/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -12,7 +12,7 @@
 from .move_item_tool import MoveItemTool
 from .list_directory_tool import ListDirectoryTool
 from .terminal_tool import TerminalTool
-from .image_text_understanding_tool import ImageTextUnderstandingTool
+from .image_understanding_tool import ImageUnderstandingTool
 
 __all__ = [
     "ExaSearchTool",
@@ -29,5 +29,5 @@
     "MoveItemTool",
     "ListDirectoryTool",
     "TerminalTool",
-    "ImageTextUnderstandingTool"
+    "ImageUnderstandingTool"
 ]
sdk/nexent/core/tools/image_understanding_tool.py

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+import json
+import logging
+from io import BytesIO
+
+from jinja2 import Template
+from pydantic import Field
+from smolagents.tools import Tool
+
+from ..models.openai_vlm import OpenAIVLModel
+from ..utils.observer import MessageObserver, ProcessType
+from ..utils.tools_common_message import ToolCategory, ToolSign
+from ... import MinIOStorageClient
+from ...multi_modal.load_save_object import LoadSaveObjectManager
+
+logger = logging.getLogger("image_understanding_tool")
+
+
+class ImageUnderstandingTool(Tool):
+    """Tool for extracting text from images stored in S3-compatible storage."""
+
+    name = "image_understanding"
+    description = (
+        "Understand an image stored in S3-compatible storage or HTTP and return the text content inside the image. "
+        "Provide the object location via an s3:// URL or http:// URL or https:// URL."
+    )
+    inputs = {
+        "image_url": {
+            "type": "string",
+            "description": "URL of the image to analyze (e.g., 's3://bucket/path/to/image.png', "
+                           "'http://image.png', 'https://image.png')."
+        },
+        "query": {
+            "type": "string",
+            "description": "The user query to perform."
+        }
+    }
+    output_type = "string"
+    # todo
+    category = ToolCategory.FILE.value
+    tool_sign = ToolSign.FILE_OPERATION.value
+
+    def __init__(
+        self,
+        observer: MessageObserver = Field(description="Message observer", default=None, exclude=True),
+        vlm_model: OpenAIVLModel = Field(description="The VLM model to use", default=None, exclude=True),
+        storage_client: MinIOStorageClient = Field(description="Storage client to use", default=None, exclude=True),
+        # TODO: confirm this is the right way to declare this parameter
+        system_prompt_template: Template = Field(description="System prompt template to use", default=None, exclude=True),
+    ):
+        super().__init__()
+        self.observer = observer
+        self.vlm_model = vlm_model
+        # Use provided storage_client or create a default one
+        # if storage_client is None:
+        #     storage_client = create_storage_client_from_config()
+        self.storage_client = storage_client
+        self.system_prompt_template = system_prompt_template
+
+        # Create LoadSaveObjectManager with the storage client
+        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)
+
+        # Dynamically apply the load_object decorator to the forward method
+        self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)
+
+        self.running_prompt_zh = "正在分析图片文字..."
+        self.running_prompt_en = "Analyzing image text..."
+
+    def _forward_impl(self, image_url: bytes, query: str) -> str:
+        """
+        Analyze the image specified by the S3 URL and return the recognized text.
+
+        Note: This method is wrapped by the load_object decorator, which downloads
+        the image from the S3 URL and passes bytes to this method.
+
+        Args:
+            image_url: Image bytes (converted from the S3 URL by the decorator).
+            query: The user query to answer about the image.
+
+        Returns:
+            The text recognized in the image.
+
+        Raises:
+            Exception: If the image cannot be downloaded or analyzed.
+        """
+        # Note: image_url is now bytes after decorator processing
+        image_stream = BytesIO(image_url)
+
+        # Send tool run message
+        if self.observer:
+            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
+            self.observer.add_message("", ProcessType.TOOL, running_prompt)
+            card_content = [{"icon": "image", "text": "Processing image..."}]
+            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))
+
+        # # Load messages based on language
+        # messages = get_file_processing_messages_template(language)
+
+        try:
+            text = self.vlm_model.analyze_image(
+                image_input=image_stream,
+                system_prompt=self.system_prompt_template.render({'query': query})).content
+            return text
+            # return messages["IMAGE_CONTENT_SUCCESS"].format(filename=filename, content=text)
+        except Exception as e:
+            raise e
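The constructor wiring in this new file is easy to misread: forward is not defined on the class, it is built at init time by wrapping _forward_impl with LoadSaveObjectManager.load_object, so callers pass a URL string while the implementation receives raw bytes. A rough, self-contained sketch of that decorator pattern, using a hypothetical in-memory loader in place of the real LoadSaveObjectManager and MinIO client:

from io import BytesIO


class FakeLoadSaveObjectManager:
    """Hypothetical stand-in for LoadSaveObjectManager: resolves URL arguments to bytes."""

    def __init__(self, blobs: dict):
        self.blobs = blobs  # maps URL -> object bytes

    def load_object(self, input_names):
        def decorator(func):
            def wrapper(**kwargs):
                for name in input_names:
                    kwargs[name] = self.blobs[kwargs[name]]  # swap the URL for its bytes
                return func(**kwargs)
            return wrapper
        return decorator


def _forward_impl(image_url: bytes, query: str) -> str:
    # By the time this runs, image_url is already bytes, mirroring the tool above.
    stream = BytesIO(image_url)
    return f"read {len(stream.getvalue())} bytes for query: {query!r}"


mm = FakeLoadSaveObjectManager({"s3://bucket/sign.png": b"\x89PNG fake image bytes"})
forward = mm.load_object(input_names=["image_url"])(_forward_impl)

print(forward(image_url="s3://bucket/sign.png", query="What does the sign say?"))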
