
Commit e0caf0e

Merge pull request #169 from ks6088ts-labs/feature/issue-168_add-caption
add image analysis sub-command to Azure OpenAI Operator CLI
2 parents 70da2dd + 4a993f6 commit e0caf0e
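
For orientation, the new sub-command is registered on the Typer app in scripts/azure_openai_operator.py. As a rough usage sketch (an assumption, not taken from the commit: Typer usually derives the command name from the function name with underscores turned into hyphens, and the default image path comes from the option's default value):

python scripts/azure_openai_operator.py image-analysis --file ./docs/images/streamlit.png --type captioning --verbose

The --type option accepts "captioning" or "object_detection"; unsupported values raise a ValueError in the command body.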

File tree: 4 files changed, +128 -1 lines changed


docs/references.md

Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,10 @@
 - [azure-rest-api-specs/specification/ai/data-plane/Azure.AI.Agents](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/ai/data-plane/Azure.AI.Agents)
 - [How to use the Deep Research tool](https://learn.microsoft.com/azure/ai-foundry/agents/how-to/tools/deep-research-samples?pivots=python)
 
+#### Vision
+
+- [What is Azure AI Vision?](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview)
+
 ### Services
 
 - [FastAPI](https://fastapi.tiangolo.com/)

scripts/azure_openai_operator.py

Lines changed: 63 additions & 1 deletion

@@ -4,8 +4,9 @@
 
 import typer
 from dotenv import load_dotenv
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import HumanMessage, SystemMessage
 
+from template_langgraph.internals.models.image_analysis import ImageCaptioningResult, ObjectDetectionResult
 from template_langgraph.llms.azure_openais import AzureOpenAiWrapper
 from template_langgraph.loggers import get_logger
 
@@ -259,6 +260,67 @@ def responses(
     logger.info(f"Output: {response.content}")
 
 
+@app.command()
+def image_analysis(
+    file_path: str = typer.Option(
+        "./docs/images/streamlit.png",
+        "--file",
+        "-f",
+        help="Path to the image file to analyze",
+    ),
+    type: str = typer.Option(
+        "captioning",
+        "--type",
+        "-t",
+        help="Type of image analysis to perform",
+        case_sensitive=False,
+        show_choices=True,
+        autocompletion=lambda: ["captioning", "object_detection"],
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+
+    result_type: ImageCaptioningResult | ObjectDetectionResult | None = None
+    if type == "captioning":
+        result_type = ImageCaptioningResult
+    elif type == "object_detection":
+        result_type = ObjectDetectionResult
+    else:
+        raise ValueError(f"Unsupported analysis type: {type}")
+
+    llm = AzureOpenAiWrapper().chat_model.with_structured_output(result_type)
+    result = llm.invoke(
+        input=[
+            SystemMessage(
+                content="You are a helpful assistant that performs image analysis tasks. "
+                "You will be provided with an image in base64 format. "
+                "Analyze the image and provide the required information based on the user's request."
+            ),
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Analyze the following image and provide the required information.",
+                    },
+                    {
+                        "type": "image",
+                        "source_type": "base64",
+                        "data": load_image_to_base64(file_path),
+                        "mime_type": "image/png",
+                    },
+                ]
+            ),
+        ],
+    )
+    logger.info(f"Result: {result}")
+
+
 if __name__ == "__main__":
     load_dotenv(
         override=True,
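
The command body calls a load_image_to_base64 helper that is defined elsewhere in scripts/azure_openai_operator.py and is not part of this diff. Below is a minimal sketch of what such a helper presumably does, assuming it simply reads the file and returns a base64 string (a hypothetical reconstruction, not the repository's actual implementation):

import base64
from pathlib import Path


def load_image_to_base64(file_path: str) -> str:
    # Read the raw image bytes and return them as a base64-encoded ASCII string,
    # matching the "data" field expected by the base64 image content block above.
    return base64.b64encode(Path(file_path).read_bytes()).decode("utf-8")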

template_langgraph/internals/models/__init__.py

Whitespace-only changes.
template_langgraph/internals/models/image_analysis.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import logging
+from dataclasses import dataclass
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class ObjectTag(Enum):
+    Book = "Book"
+    Person = "Person"
+    Car = "Car"
+    Dog = "Dog"
+    Cat = "Cat"
+    # Add more tags as needed
+
+
+# https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/concept-object-detection-40
+@dataclass
+class BoundingBox:
+    left: int
+    top: int
+    width: int
+    height: int
+
+
+class ObjectDetectionResponse(BaseModel):
+    """
+    Object Detection Response Model
+    """
+
+    name: ObjectTag = Field(description="Detected object tag")
+    confidence: float = Field(description="Confidence score of the detection (0 to 1)")
+    bounding_box: BoundingBox = Field(description="Bounding box of the detected object")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ObjectDetectionResult(BaseModel):
+    """
+    Object Detection Result Model
+    """
+
+    objects: list[ObjectDetectionResponse] = Field(description="List of detected objects")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ImageCaptioningResult(BaseModel):
+    """
+    Image Captioning Result Model
+    """
+
+    caption: str = Field(description="Caption of the image")
+    confidence: float = Field(description="Confidence score of the caption")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
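
To illustrate the schema these models describe, here is a small self-contained sketch that builds an ObjectDetectionResult by hand; the values are invented for illustration, and the import path assumes the new module is template_langgraph/internals/models/image_analysis.py, as the CLI import above suggests. In the CLI itself the fields are populated by the with_structured_output call rather than by hand.

from template_langgraph.internals.models.image_analysis import (
    BoundingBox,
    ObjectDetectionResponse,
    ObjectDetectionResult,
    ObjectTag,
)

# Hypothetical detection: one dog with an invented confidence and bounding box,
# purely to show the JSON shape the structured-output call is asked to produce.
result = ObjectDetectionResult(
    objects=[
        ObjectDetectionResponse(
            name=ObjectTag.Dog,
            confidence=0.92,
            bounding_box=BoundingBox(left=10, top=20, width=120, height=80),
        )
    ]
)
print(result)  # __str__ renders the result as indented JSON via model_dump_json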
