1+ """
2+ Analyze Image Tool
3+
4+ Analyze images using a large language model.
5+ Supports images from S3, HTTP, and HTTPS URLs.
6+ """
7+
8+ import json
9+ import logging
10+ from io import BytesIO
11+ from typing import List
12+
13+ from jinja2 import Template , StrictUndefined
14+ from pydantic import Field
15+ from smolagents .tools import Tool
16+
17+ from nexent .core .models import OpenAIVLModel
18+ from nexent .core .utils .observer import MessageObserver , ProcessType
19+ from nexent .core .utils .prompt_template_utils import get_prompt_template
20+ from nexent .core .utils .tools_common_message import ToolCategory , ToolSign
21+ from nexent .storage import MinIOStorageClient
22+ from nexent .multi_modal .load_save_object import LoadSaveObjectManager
23+
# Module-level logger for this tool, registered under the fixed name "analyze_image_tool".
24+ logger = logging .getLogger ("analyze_image_tool" )
25+
26+
class AnalyzeImageTool(Tool):
    """Tool for understanding and analyzing images using a visual language model.

    The public ``forward`` entry point is created per instance in ``__init__``
    by wrapping ``_forward_impl`` with ``LoadSaveObjectManager.load_object``
    for the ``image_urls_list`` argument. According to the notes kept with
    ``_forward_impl``, that wrapper resolves each URL to raw bytes before the
    implementation runs.
    """

    name = "analyze_image"
    description = (
        "This tool uses a visual language model to understand images based on your query and then returns a description of the image.\n "
        "It is used to understand and analyze multiple images, with image sources supporting S3 URLs (s3://bucket/key or /bucket/key), "
        "HTTP, and HTTPS URLs.\n "
        "Use this tool when you want to retrieve information contained in an image and provide the image's URL and your query."
    )
    inputs = {
        "image_urls_list": {
            "type": "array",
            "description": "List of image URLs (S3, HTTP, or HTTPS). Supports s3://bucket/key, /bucket/key, http://, and https:// URLs.",
        },
        "query": {
            "type": "string",
            "description": "User's question to guide the analysis"
        }
    }
    output_type = "array"
    category = ToolCategory.MULTIMODAL.value
    tool_sign = ToolSign.MULTIMODAL_OPERATION.value

    def __init__(
        self,
        observer: MessageObserver = Field(
            description="Message observer",
            default=None,
            exclude=True),
        vlm_model: OpenAIVLModel = Field(
            description="The VLM model to use",
            default=None,
            exclude=True),
        storage_client: MinIOStorageClient = Field(
            description="Storage client for downloading files from S3 URLs、HTTP URLs、HTTPS URLs.",
            default=None,
            exclude=True)
    ):
        """Store the collaborators and bind the decorated ``forward`` method.

        Args:
            observer: Receives progress messages; its ``lang`` attribute
                selects the prompt language. Skipped when falsy.
            vlm_model: Model whose ``analyze_image`` method is called once
                per image.
            storage_client: Handed to ``LoadSaveObjectManager``.
        """
        super().__init__()
        self.observer = observer
        self.vlm_model = vlm_model
        self.storage_client = storage_client
        # Create LoadSaveObjectManager with the storage client
        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)

        # Dynamically apply the load_object decorator to the forward method so
        # that the decorator can use this instance's storage client.
        self.forward = self.mm.load_object(input_names=["image_urls_list"])(self._forward_impl)

        self.running_prompt_zh = "正在分析图片..."
        self.running_prompt_en = "Analyzing image..."

    def _forward_impl(self, image_urls_list: List[bytes], query: str) -> List[str]:
        """Analyze each image with the VLM and return one result per image.

        Note: this method is wrapped by the ``load_object`` decorator, which
        is expected to download every S3/HTTP/HTTPS URL and pass the image
        bytes to this method in place of the URLs.

        Args:
            image_urls_list: Image payloads as bytes, in the order the URLs
                were supplied by the caller.
            query: User's question; rendered into the system prompt.

        Returns:
            List[str]: ``response.content`` for each image, aligned with the
            order of ``image_urls_list``.

        Raises:
            ValueError: If ``image_urls_list`` is ``None``, not a list, or empty.
            Exception: If analyzing any image fails. The original error is
                attached as ``__cause__``.
        """
        # Send tool run message
        if self.observer:
            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
            self.observer.add_message("", ProcessType.TOOL, running_prompt)
            card_content = [{"icon": "image", "text": "Analyzing images..."}]
            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))

        if image_urls_list is None:
            raise ValueError("image_urls cannot be None")

        if not isinstance(image_urls_list, list):
            raise ValueError("image_urls must be a list of bytes")

        if not image_urls_list:
            raise ValueError("image_urls must contain at least one image")

        # Load prompts from yaml file; StrictUndefined makes a template that
        # references an unknown variable fail loudly instead of rendering "".
        language = self.observer.lang if self.observer else "en"
        prompts = get_prompt_template(template_type='analyze_image', language=language)
        system_prompt = Template(prompts['system_prompt'], undefined=StrictUndefined).render({'query': query})

        try:
            analysis_results: List[str] = []
            for index, image_bytes in enumerate(image_urls_list, start=1):
                logger.info("Extracting image #%s, query: %s", index, query)
                image_stream = BytesIO(image_bytes)
                try:
                    response = self.vlm_model.analyze_image(
                        image_input=image_stream,
                        system_prompt=system_prompt
                    )
                except Exception as e:
                    # Name the failing image (1-based) and keep the root cause chained.
                    raise Exception(f"Error understanding image {index}: {e}") from e

                analysis_results.append(response.content)

            return analysis_results
        except Exception as e:
            logger.error("Error analyzing image: %s", e, exc_info=True)
            error_msg = f"Error analyzing image: {e}"
            raise Exception(error_msg) from e
0 commit comments