"""
Analyze Image Tool

Analyze images using a large language model.
Supports images from S3, HTTP, and HTTPS URLs.
"""
7+
18import json
29import logging
310from io import BytesIO
11+ from typing import List , Union
412
513from jinja2 import Template , StrictUndefined
614from pydantic import Field
715from smolagents .tools import Tool
816
9- from . .models . openai_vlm import OpenAIVLModel
10- from . .utils .observer import MessageObserver , ProcessType
11- from . .utils .prompt_template_utils import get_prompt_template
12- from . .utils .tools_common_message import ToolCategory , ToolSign
13- from ... import MinIOStorageClient
14- from .. .multi_modal .load_save_object import LoadSaveObjectManager
17+ from nexent . core .models import OpenAIVLModel
18+ from nexent . core .utils .observer import MessageObserver , ProcessType
19+ from nexent . core .utils .prompt_template_utils import get_prompt_template
20+ from nexent . core .utils .tools_common_message import ToolCategory , ToolSign
21+ from nexent . storage import MinIOStorageClient
22+ from nexent .multi_modal .load_save_object import LoadSaveObjectManager
1523
1624logger = logging .getLogger ("analyze_image_tool" )
1725
1826
1927class AnalyzeImageTool (Tool ):
20- """Tool for understanding and analyzing image"""
28+ """Tool for understanding and analyzing image using a visual language model """
2129
2230 name = "analyze_image"
2331 description = (
24- "This tool uses a visual language model to understand images based on your query and then returns a description of the image."
25- "It's used to understand and analyze images stored in S3 buckets, via HTTP and HTTPS."
32+ "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n "
33+ "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
34+ "HTTP, and HTTPS URLs.\n "
2635 "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
2736 )
2837 inputs = {
29- "image_url " : {
30- "type" : "string " ,
31- "description" : "URL of the image to analyze (e.g., ' s3://bucket/path/to/image.png', "
32- "'http://image.png', 'https:// image.png')."
38+ "image_urls_list " : {
39+ "type" : "array " ,
40+ "description" : "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs. "
41+ "Can also accept a single image URL which will be treated as a list with one element." ,
3342 },
3443 "query" : {
3544 "type" : "string" ,
36- "description" : "The user query to perform. "
45+ "description" : "User's question to guide the analysis "
3746 }
3847 }
3948 output_type = "string"
40- # todo
4149 category = ToolCategory .FILE .value
4250 tool_sign = ToolSign .FILE_OPERATION .value
4351
4452 def __init__ (
4553 self ,
46- observer : MessageObserver = Field (description = "Message observer" , default = None , exclude = True ),
47- vlm_model : OpenAIVLModel = Field (description = "The VLM model to use" , default = None , exclude = True ),
48- storage_client : MinIOStorageClient = Field (description = "Storage client to use" , default = None , exclude = True ),
54+ observer : MessageObserver = Field (
55+ description = "Message observer" ,
56+ default = None ,
57+ exclude = True ),
58+ vlm_model : OpenAIVLModel = Field (
59+ description = "The VLM model to use" ,
60+ default = None ,
61+ exclude = True ),
62+ storage_client : MinIOStorageClient = Field (
63+ description = "Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs." ,
64+ default = None ,
65+ exclude = True )
4966 ):
5067 super ().__init__ ()
5168 self .observer = observer
@@ -55,49 +72,74 @@ def __init__(
5572 self .mm = LoadSaveObjectManager (storage_client = self .storage_client )
5673
5774 # Dynamically apply the load_object decorator to forward method
58- self .forward = self .mm .load_object (input_names = ["image_url " ])(self ._forward_impl )
75+ self .forward = self .mm .load_object (input_names = ["image_urls_list " ])(self ._forward_impl )
5976
6077 self .running_prompt_zh = "正在分析图片..."
6178 self .running_prompt_en = "Analyzing image..."
6279
63- def _forward_impl (self , image_url : bytes , query : str ) -> str :
def _forward_impl(self, image_urls_list: Union[bytes, List[bytes]], query: str) -> Union[str, List[str]]:
    """
    Analyze one or more images with the configured VLM and return its answer(s).

    Note: ``__init__`` wraps this method with ``load_object`` for the
    ``image_urls_list`` input; per the original documentation that decorator
    resolves S3/HTTP/HTTPS URLs to raw bytes before this method runs, so the
    values handled here are image payloads, not URLs.

    Args:
        image_urls_list: A single image payload (bytes-like) or a list/tuple
            of image payloads.
        query: User's question to guide the analysis; it is rendered into the
            ``system_prompt`` template of the ``analyze_image`` prompt set.

    Returns:
        Union[str, List[str]]: ``response.content`` of the single VLM call
        when exactly one image was supplied, otherwise a list of
        ``response.content`` values in the same order as the input images.
        NOTE(review): ``output_type`` on the class is "string" while this can
        return a list -- confirm the caller tolerates both shapes.

    Raises:
        ValueError: If the input is None, empty, or contains anything that is
            not a bytes-like object. Raised before any VLM call is made.
        Exception: If analyzing any image fails; the original error is
            attached as ``__cause__``.
    """
    if image_urls_list is None:
        raise ValueError("image_urls cannot be None")

    bytes_like = (bytes, bytearray, memoryview)

    # Normalize to a list under a new local name instead of re-annotating
    # the parameter.
    if isinstance(image_urls_list, (list, tuple)):
        images = list(image_urls_list)
    elif isinstance(image_urls_list, bytes_like):
        images = [image_urls_list]
    else:
        raise ValueError("image_urls must be bytes or a list/tuple of bytes")

    if not images:
        raise ValueError("image_urls must contain at least one image")

    # Fail fast: reject a bad item before spending VLM calls on the items
    # that precede it.
    for position, image in enumerate(images, start=1):
        if not isinstance(image, bytes_like):
            raise ValueError(
                "image_urls must be bytes or a list/tuple of bytes "
                f"(item {position} is {type(image).__name__})"
            )

    # Send tool run message
    if self.observer:
        running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
        self.observer.add_message("", ProcessType.TOOL, running_prompt)
        card_content = [{"icon": "image", "text": "Analyzing images..."}]
        self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))

    # Load prompts from yaml file; fall back to English without an observer.
    language = self.observer.lang if self.observer else "en"
    prompts = get_prompt_template(template_type='analyze_image', language=language)
    # The rendered prompt depends only on the query, so build it once.
    system_prompt = Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query})

    analysis_results: List[str] = []
    for index, image_bytes in enumerate(images, start=1):
        logger.info("Extracting image #%d, query: %s", index, query)
        try:
            response = self.vlm_model.analyze_image(
                image_input=BytesIO(image_bytes),
                system_prompt=system_prompt,
            )
            analysis_results.append(response.content)
        except Exception as e:
            # One handler instead of two nested wrappers. The two-part
            # wording is kept so callers and log filters matching on either
            # phrase keep working; the root cause is chained explicitly and
            # its traceback is what gets logged.
            error_msg = f"Error analyzing image: Error understanding image {index}: {e}"
            logger.error(error_msg, exc_info=True)
            raise Exception(error_msg) from e

    if len(analysis_results) == 1:
        return analysis_results[0]
    return analysis_results
0 commit comments