diff --git a/docs/references.md b/docs/references.md
index 0769692..c49d7e8 100644
--- a/docs/references.md
+++ b/docs/references.md
@@ -32,6 +32,10 @@
 - [azure-rest-api-specs/specification/ai/data-plane/Azure.AI.Agents](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/ai/data-plane/Azure.AI.Agents)
 - [How to use the Deep Research tool](https://learn.microsoft.com/azure/ai-foundry/agents/how-to/tools/deep-research-samples?pivots=python)
 
+#### Vision
+
+- [What is Azure AI Vision?](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview)
+
 ### Services
 
 - [FastAPI](https://fastapi.tiangolo.com/)
diff --git a/scripts/azure_openai_operator.py b/scripts/azure_openai_operator.py
index 4e03808..b5b1055 100644
--- a/scripts/azure_openai_operator.py
+++ b/scripts/azure_openai_operator.py
@@ -4,8 +4,9 @@
 
 import typer
 from dotenv import load_dotenv
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import HumanMessage, SystemMessage
 
+from template_langgraph.internals.models.image_analysis import ImageCaptioningResult, ObjectDetectionResult
 from template_langgraph.llms.azure_openais import AzureOpenAiWrapper
 from template_langgraph.loggers import get_logger
 
@@ -259,6 +260,67 @@ def responses(
     logger.info(f"Output: {response.content}")
 
 
+@app.command()
+def image_analysis(
+    file_path: str = typer.Option(
+        "./docs/images/streamlit.png",
+        "--file",
+        "-f",
+        help="Path to the image file to analyze",
+    ),
+    type: str = typer.Option(
+        "captioning",
+        "--type",
+        "-t",
+        help="Type of image analysis to perform",
+        case_sensitive=False,
+        show_choices=True,
+        autocompletion=lambda: ["captioning", "object_detection"],
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+
+    result_type: ImageCaptioningResult | ObjectDetectionResult | None = None
+    if type == "captioning":
+        result_type = ImageCaptioningResult
+    elif type == "object_detection":
+        result_type = ObjectDetectionResult
+    else:
+        raise ValueError(f"Unsupported analysis type: {type}")
+
+    llm = AzureOpenAiWrapper().chat_model.with_structured_output(result_type)
+    result = llm.invoke(
+        input=[
+            SystemMessage(
+                content="You are a helpful assistant that performs image analysis tasks. "
+                "You will be provided with an image in base64 format. "
+                "Analyze the image and provide the required information based on the user's request."
+            ),
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Analyze the following image and provide the required information.",
+                    },
+                    {
+                        "type": "image",
+                        "source_type": "base64",
+                        "data": load_image_to_base64(file_path),
+                        "mime_type": "image/png",
+                    },
+                ]
+            ),
+        ],
+    )
+    logger.info(f"Result: {result}")
+
+
 if __name__ == "__main__":
     load_dotenv(
         override=True,
diff --git a/template_langgraph/internals/models/__init__.py b/template_langgraph/internals/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/template_langgraph/internals/models/image_analysis.py b/template_langgraph/internals/models/image_analysis.py
new file mode 100644
index 0000000..2b7e693
--- /dev/null
+++ b/template_langgraph/internals/models/image_analysis.py
@@ -0,0 +1,61 @@
+import logging
+from dataclasses import dataclass
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class ObjectTag(Enum):
+    Book = "Book"
+    Person = "Person"
+    Car = "Car"
+    Dog = "Dog"
+    Cat = "Cat"
+    # Add more tags as needed
+
+
+# https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/concept-object-detection-40
+@dataclass
+class BoundingBox:
+    left: int
+    top: int
+    width: int
+    height: int
+
+
+class ObjectDetectionResponse(BaseModel):
+    """
+    Object Detection Response Model
+    """
+
+    name: ObjectTag = Field(description="Detected object tag")
+    confidence: float = Field(description="Confidence score of the detection (0 to 1)")
+    bounding_box: BoundingBox = Field(description="Bounding box of the detected object")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ObjectDetectionResult(BaseModel):
+    """
+    Object Detection Result Model
+    """
+
+    objects: list[ObjectDetectionResponse] = Field(description="List of detected objects")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ImageCaptioningResult(BaseModel):
+    """
+    Image Captioning Result Model
+    """
+
+    caption: str = Field(description="Caption of the image")
+    confidence: float = Field(description="Confidence score of the caption")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
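
A minimal usage sketch of the new command, under a few assumptions not shown in the diff: the script is runnable directly (the `if __name__ == "__main__":` block presumably calls `app()` after `load_dotenv`), Typer's default kebab-case naming exposes `image_analysis` as `image-analysis`, and the `load_image_to_base64` helper referenced above is already defined in `scripts/azure_openai_operator.py`:

    # Caption the sample screenshot (default --type is "captioning")
    python scripts/azure_openai_operator.py image-analysis --file ./docs/images/streamlit.png --verbose

    # Run object detection instead; the reply is parsed into ObjectDetectionResult with bounding boxes
    python scripts/azure_openai_operator.py image-analysis -f ./docs/images/streamlit.png -t object_detection

Because `--type` selects which Pydantic schema is handed to `with_structured_output`, the model's reply is validated into `ImageCaptioningResult` or `ObjectDetectionResult` rather than returned as free text.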