4 changes: 4 additions & 0 deletions docs/references.md
@@ -32,6 +32,10 @@
- [azure-rest-api-specs/specification/ai/data-plane/Azure.AI.Agents](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/ai/data-plane/Azure.AI.Agents)
- [How to use the Deep Research tool](https://learn.microsoft.com/azure/ai-foundry/agents/how-to/tools/deep-research-samples?pivots=python)

#### Vision

- [What is Azure AI Vision?](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview)

### Services

- [FastAPI](https://fastapi.tiangolo.com/)
64 changes: 63 additions & 1 deletion scripts/azure_openai_operator.py
@@ -4,8 +4,9 @@

import typer
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_core.messages import HumanMessage, SystemMessage

from template_langgraph.internals.models.image_analysis import ImageCaptioningResult, ObjectDetectionResult
from template_langgraph.llms.azure_openais import AzureOpenAiWrapper
from template_langgraph.loggers import get_logger

@@ -259,6 +260,67 @@ def responses(
logger.info(f"Output: {response.content}")


@app.command()
def image_analysis(
file_path: str = typer.Option(
"./docs/images/streamlit.png",
"--file",
"-f",
help="Path to the image file to analyze",
),
type: str = typer.Option(
"captioning",
"--type",
"-t",
help="Type of image analysis to perform",
case_sensitive=False,
show_choices=True,
autocompletion=lambda: ["captioning", "object_detection"],
),
verbose: bool = typer.Option(
False,
"--verbose",
"-v",
help="Enable verbose output",
),
):
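"""Run image captioning or object detection on a local image and log the structured result."""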
set_verbose_logging(verbose)

result_type: type[ImageCaptioningResult] | type[ObjectDetectionResult] | None = None
if type == "captioning":
result_type = ImageCaptioningResult
elif type == "object_detection":
result_type = ObjectDetectionResult
else:
raise ValueError(f"Unsupported analysis type: {type}")

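# Bind the selected Pydantic schema so the chat model returns structured JSON matching that result type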
llm = AzureOpenAiWrapper().chat_model.with_structured_output(result_type)
result = llm.invoke(
input=[
SystemMessage(
content="You are a helpful assistant that performs image analysis tasks. "
"You will be provided with an image in base64 format. "
"Analyze the image and provide the required information based on the user's request."
),
HumanMessage(
content=[
{
"type": "text",
"text": "Analyze the following image and provide the required information.",
},
{
"type": "image",
"source_type": "base64",
"data": load_image_to_base64(file_path),
"mime_type": "image/png",
},
]
),
],
)
logger.info(f"Result: {result}")


if __name__ == "__main__":
load_dotenv(
override=True,
Empty file.
61 changes: 61 additions & 0 deletions template_langgraph/internals/models/image_analysis.py
@@ -0,0 +1,61 @@
import logging
from dataclasses import dataclass
from enum import Enum

from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


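# Closed set of labels the structured output may assign to detected objects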
class ObjectTag(Enum):
Book = "Book"
Person = "Person"
Car = "Car"
Dog = "Dog"
Cat = "Cat"
# Add more tags as needed


# https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/concept-object-detection-40
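# Bounding box in pixel coordinates; (left, top) is the upper-left corner of the detected region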
@dataclass
class BoundingBox:
left: int
top: int
width: int
height: int


class ObjectDetectionResponse(BaseModel):
"""
Object Detection Response Model
"""

name: ObjectTag = Field(description="Detected object tag")
confidence: float = Field(description="Confidence score of the detection (0 to 1)")
bounding_box: BoundingBox = Field(description="Bounding box of the detected object")

def __str__(self) -> str:
return self.model_dump_json(indent=2)


class ObjectDetectionResult(BaseModel):
"""
Object Detection Result Model
"""

objects: list[ObjectDetectionResponse] = Field(description="List of detected objects")

def __str__(self) -> str:
return self.model_dump_json(indent=2)


class ImageCaptioningResult(BaseModel):
"""
Image Captioning Result Model
"""

caption: str = Field(description="Caption of the image")
confidence: float = Field(description="Confidence score of the caption")

def __str__(self) -> str:
return self.model_dump_json(indent=2)