1+ """
2+ Analyze Image Tool
3+
4+ Analyze images using a large language model.
5+ Supports images from S3, HTTP, and HTTPS URLs.
6+ """
7+
18import json
29import logging
310from io import BytesIO
11+ from typing import List
412
513from jinja2 import Template , StrictUndefined
614from pydantic import Field
715from smolagents .tools import Tool
816
9- from . .models . openai_vlm import OpenAIVLModel
10- from . .utils .observer import MessageObserver , ProcessType
11- from . .utils .prompt_template_utils import get_prompt_template
12- from . .utils .tools_common_message import ToolCategory , ToolSign
13- from ... import MinIOStorageClient
14- from .. .multi_modal .load_save_object import LoadSaveObjectManager
17+ from nexent . core .models import OpenAIVLModel
18+ from nexent . core .utils .observer import MessageObserver , ProcessType
19+ from nexent . core .utils .prompt_template_utils import get_prompt_template
20+ from nexent . core .utils .tools_common_message import ToolCategory , ToolSign
21+ from nexent . storage import MinIOStorageClient
22+ from nexent .multi_modal .load_save_object import LoadSaveObjectManager
1523
1624logger = logging .getLogger ("analyze_image_tool" )
1725
1826
1927class AnalyzeImageTool (Tool ):
20- """Tool for understanding and analyzing image"""
28+ """Tool for understanding and analyzing image using a visual language model """
2129
2230 name = "analyze_image"
2331 description = (
24- "This tool uses a visual language model to understand images based on your query and then returns a description of the image."
25- "It's used to understand and analyze images stored in S3 buckets, via HTTP and HTTPS."
32+ "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n "
33+ "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
34+ "HTTP, and HTTPS URLs.\n "
2635 "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
2736 )
2837 inputs = {
29- "image_url" : {
30- "type" : "string" ,
31- "description" : "URL of the image to analyze (e.g., 's3://bucket/path/to/image.png',"
32- "'http://image.png', 'https://image.png')."
38+ "image_urls_list" : {
39+ "type" : "array" ,
40+ "description" : "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs." ,
3341 },
3442 "query" : {
3543 "type" : "string" ,
36- "description" : "The user query to perform. "
44+ "description" : "User's question to guide the analysis "
3745 }
3846 }
39- output_type = "string"
40- # todo
41- category = ToolCategory .FILE .value
42- tool_sign = ToolSign .FILE_OPERATION .value
47+ output_type = "array"
48+ category = ToolCategory .MULTIMODAL .value
49+ tool_sign = ToolSign .MULTIMODAL_OPERATION .value
4350
4451 def __init__ (
4552 self ,
46- observer : MessageObserver = Field (description = "Message observer" , default = None , exclude = True ),
47- vlm_model : OpenAIVLModel = Field (description = "The VLM model to use" , default = None , exclude = True ),
48- storage_client : MinIOStorageClient = Field (description = "Storage client to use" , default = None , exclude = True ),
53+ observer : MessageObserver = Field (
54+ description = "Message observer" ,
55+ default = None ,
56+ exclude = True ),
57+ vlm_model : OpenAIVLModel = Field (
58+ description = "The VLM model to use" ,
59+ default = None ,
60+ exclude = True ),
61+ storage_client : MinIOStorageClient = Field (
62+ description = "Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs." ,
63+ default = None ,
64+ exclude = True )
4965 ):
5066 super ().__init__ ()
5167 self .observer = observer
@@ -55,49 +71,68 @@ def __init__(
5571 self .mm = LoadSaveObjectManager (storage_client = self .storage_client )
5672
5773 # Dynamically apply the load_object decorator to forward method
58- self .forward = self .mm .load_object (input_names = ["image_url " ])(self ._forward_impl )
74+ self .forward = self .mm .load_object (input_names = ["image_urls_list " ])(self ._forward_impl )
5975
6076 self .running_prompt_zh = "正在分析图片..."
6177 self .running_prompt_en = "Analyzing image..."
6278
63- def _forward_impl (self , image_url : bytes , query : str ) -> str :
79+ def _forward_impl (self , image_urls_list : List [ bytes ] , query : str ) -> List [ str ] :
6480 """
65- Analyze images of S3 URL, HTTP URL, or HTTPS URL and return the identified text.
81+ Analyze images identified by S3 URL, HTTP URL, or HTTPS URL and return the identified text.
6682
6783 Note: This method is wrapped by load_object decorator which downloads
6884 the image from S3 URL, HTTP URL, or HTTPS URL and passes bytes to this method.
6985
7086 Args:
71- image_url: Image bytes (converted from S3 URL, HTTP URL, or HTTPS URL by decorator).
87+ image_urls_list: List of image bytes converted from URLs by the decorator.
88+ The load_object decorator converts URLs to bytes before calling this method.
89+ query: User's question to guide the analysis
7290
7391 Returns:
74- JSON string containing the recognized text.
92+ List[str]: One analysis string per image that aligns with the order
93+ of the provided images.
7594
7695 Raises:
7796 Exception: If the image cannot be downloaded or analyzed.
7897 """
79- # Note: image_url is now bytes after decorator processing
80- image_stream = BytesIO (image_url )
81-
8298 # Send tool run message
8399 if self .observer :
84100 running_prompt = self .running_prompt_zh if self .observer .lang == "zh" else self .running_prompt_en
85101 self .observer .add_message ("" , ProcessType .TOOL , running_prompt )
86- card_content = [{"icon" : "image" , "text" : "Analyzing image ..." }]
102+ card_content = [{"icon" : "image" , "text" : f "Analyzing images ..." }]
87103 self .observer .add_message ("" , ProcessType .CARD , json .dumps (card_content , ensure_ascii = False ))
88104
105+ if image_urls_list is None :
106+ raise ValueError ("image_urls cannot be None" )
107+
108+ if not isinstance (image_urls_list , list ):
109+ raise ValueError ("image_urls must be a list of bytes" )
110+
111+ if not image_urls_list :
112+ raise ValueError ("image_urls must contain at least one image" )
113+
89114 # Load prompts from yaml file
90- prompts = get_prompt_template (template_type = 'analyze_image' , language = self .observer .lang )
115+ language = self .observer .lang if self .observer else "en"
116+ prompts = get_prompt_template (template_type = 'analyze_image' , language = language )
117+ system_prompt = Template (prompts ['system_prompt' ], undefined = StrictUndefined ).render ({'query' : query })
91118
92119 try :
93-
94- response = self .vlm_model .analyze_image (
95- image_input = image_stream ,
96- system_prompt = Template (prompts ['system_prompt' ], undefined = StrictUndefined ).render ({'query' : query }))
120+ analysis_results : List [str ] = []
121+ for index , image_bytes in enumerate (image_urls_list , start = 1 ):
122+ logger .info (f"Extracting image #{ index } , query: { query } " )
123+ image_stream = BytesIO (image_bytes )
124+ try :
125+ response = self .vlm_model .analyze_image (
126+ image_input = image_stream ,
127+ system_prompt = system_prompt
128+ )
129+ except Exception as e :
130+ raise Exception (f"Error understanding image { index } : { str (e )} " )
131+
132+ analysis_results .append (response .content )
133+
134+ return analysis_results
97135 except Exception as e :
98- raise Exception (f"Error understanding image: { str (e )} " )
99- text = response .content
100- # Record the detailed content of this search
101- # todo 返回的结构体是什么?
102- search_results_data = {'text' : text }
103- return json .dumps (search_results_data , ensure_ascii = False )
136+ logger .error (f"Error analyzing image: { str (e )} " , exc_info = True )
137+ error_msg = f"Error analyzing image: { str (e )} "
138+ raise Exception (error_msg )
0 commit comments