|
| 1 | +import json |
| 2 | +import logging |
| 3 | +from io import BytesIO |
| 4 | + |
| 5 | +from jinja2 import Template |
| 6 | +from pydantic import Field |
| 7 | +from smolagents.tools import Tool |
| 8 | + |
| 9 | +from ..models.openai_vlm import OpenAIVLModel |
| 10 | +from ..utils.observer import MessageObserver, ProcessType |
| 11 | +from ..utils.tools_common_message import ToolCategory, ToolSign |
| 12 | +from ... import MinIOStorageClient |
| 13 | +from ...multi_modal.load_save_object import LoadSaveObjectManager |
| 14 | + |
# Module-level logger for this tool, registered under a fixed, explicit name.
logger = logging.getLogger("image_understanding_tool")
| 16 | + |
| 17 | + |
class ImageUnderstandingTool(Tool):
    """Tool that sends an image and a user query to a vision-language model.

    The image is referenced by URL (s3://, http:// or https://). The public
    ``forward`` callable is built in ``__init__`` by wrapping
    ``_forward_impl`` with ``LoadSaveObjectManager.load_object`` so that the
    ``image_url`` argument reaches the implementation as raw bytes.
    """

    name = "image_understanding"
    description = (
        "Understand an image stored in S3-compatible storage or HTTP and return the text content inside the image. "
        "Provide the object location via an s3:// URL or http:// URL or https:// URL."
    )
    inputs = {
        "image_url": {
            "type": "string",
            "description": "URL of the image to analyze (e.g., 's3://bucket/path/to/image.png',"
                           "'http://image.png', 'https://image.png')."
        },
        "query": {
            "type": "string",
            "description": "The user query to perform."
        }
    }
    output_type = "string"
    # TODO: confirm that FILE / FILE_OPERATION are the right category and sign
    # for this tool.
    category = ToolCategory.FILE.value
    tool_sign = ToolSign.FILE_OPERATION.value

    def __init__(
        self,
        observer: MessageObserver = Field(description="Message observer", default=None, exclude=True),
        vlm_model: OpenAIVLModel = Field(description="The VLM model to use", default=None, exclude=True),
        storage_client: MinIOStorageClient = Field(description="Storage client to use", default=None, exclude=True),
        # TODO: confirm that declaring the template parameter this way is correct.
        system_prompt_template: Template = Field(description="System prompt template to use", default=None, exclude=True),
    ):
        """Store collaborators and build the URL-resolving ``forward`` callable.

        NOTE(review): a parameter that is omitted receives the ``Field(...)``
        object itself as its value, not ``None`` -- confirm that callers always
        pass real instances.

        Args:
            observer: Receives progress messages while the tool runs.
            vlm_model: Model whose ``analyze_image`` method is called.
            storage_client: Client handed to ``LoadSaveObjectManager``.
            system_prompt_template: Template rendered with the user query to
                produce the system prompt.
        """
        super().__init__()
        self.observer = observer
        self.vlm_model = vlm_model
        # TODO: fall back to a default storage client built from configuration
        # when none is supplied.
        self.storage_client = storage_client
        self.system_prompt_template = system_prompt_template

        # Manager whose load_object decorator is used to wrap _forward_impl.
        self.mm = LoadSaveObjectManager(storage_client=self.storage_client)

        # The decorator is applied per instance (not at class level) because it
        # is obtained from the manager created just above.
        self.forward = self.mm.load_object(input_names=["image_url"])(self._forward_impl)

        self.running_prompt_zh = "正在分析图片文字..."
        self.running_prompt_en = "Analyzing image text..."

    def _forward_impl(self, image_url: bytes, query: str) -> str:
        """Analyze the image with the VLM and return the model's answer.

        This method is not called directly with a URL: ``forward`` wraps it
        with ``load_object(input_names=["image_url"])``, so ``image_url`` is
        expected to arrive here already converted to image bytes.

        Args:
            image_url: Image bytes (converted from the URL by the decorator).
            query: The user query, rendered into the system prompt template.

        Returns:
            The ``content`` attribute of the response returned by
            ``vlm_model.analyze_image``.

        Raises:
            Exception: Whatever the template rendering or the model call
                raises; the error is logged and re-raised unchanged.
        """
        image_stream = BytesIO(image_url)

        # Notify the observer (if any) that the tool has started running.
        if self.observer:
            running_prompt = self.running_prompt_zh if self.observer.lang == "zh" else self.running_prompt_en
            self.observer.add_message("", ProcessType.TOOL, running_prompt)
            card_content = [{"icon": "image", "text": "Processing image..."}]
            self.observer.add_message("", ProcessType.CARD, json.dumps(card_content, ensure_ascii=False))

        # TODO: wrap the result in a language-specific message template.
        try:
            system_prompt = self.system_prompt_template.render({'query': query})
            response = self.vlm_model.analyze_image(
                image_input=image_stream,
                system_prompt=system_prompt,
            )
            return response.content
        except Exception:
            # Record the failure with its traceback, then propagate the
            # original exception untouched (bare raise adds no extra frame).
            logger.exception("Image understanding failed")
            raise
0 commit comments