
Commit e0caf0e

Merge pull request #169 from ks6088ts-labs/feature/issue-168_add-caption
add image analysis sub-command to Azure OpenAI Operator CLI
2 parents 70da2dd + 4a993f6 commit e0caf0e
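
For orientation, the new sub-command is registered on the Typer app in scripts/azure_openai_operator.py. As a rough usage sketch (an assumption, not taken from the commit: Typer usually derives the command name from the function name with underscores turned into hyphens, and the default image path comes from the option's default value):

python scripts/azure_openai_operator.py image-analysis --file ./docs/images/streamlit.png --type captioning --verbose

The --type option accepts "captioning" or "object_detection"; unsupported values raise a ValueError in the command body.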

File tree: 4 files changed, +128 -1 lines changed


docs/references.md

Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,10 @@
 - [azure-rest-api-specs/specification/ai/data-plane/Azure.AI.Agents](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/ai/data-plane/Azure.AI.Agents)
 - [How to use the Deep Research tool](https://learn.microsoft.com/azure/ai-foundry/agents/how-to/tools/deep-research-samples?pivots=python)
 
+#### Vision
+
+- [What is Azure AI Vision?](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview)
+
 ### Services
 
 - [FastAPI](https://fastapi.tiangolo.com/)

scripts/azure_openai_operator.py

Lines changed: 63 additions & 1 deletion

@@ -4,8 +4,9 @@
 
 import typer
 from dotenv import load_dotenv
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import HumanMessage, SystemMessage
 
+from template_langgraph.internals.models.image_analysis import ImageCaptioningResult, ObjectDetectionResult
 from template_langgraph.llms.azure_openais import AzureOpenAiWrapper
 from template_langgraph.loggers import get_logger
 
@@ -259,6 +260,67 @@ def responses(
     logger.info(f"Output: {response.content}")
 
 
+@app.command()
+def image_analysis(
+    file_path: str = typer.Option(
+        "./docs/images/streamlit.png",
+        "--file",
+        "-f",
+        help="Path to the image file to analyze",
+    ),
+    type: str = typer.Option(
+        "captioning",
+        "--type",
+        "-t",
+        help="Type of image analysis to perform",
+        case_sensitive=False,
+        show_choices=True,
+        autocompletion=lambda: ["captioning", "object_detection"],
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+
+    result_type: ImageCaptioningResult | ObjectDetectionResult | None = None
+    if type == "captioning":
+        result_type = ImageCaptioningResult
+    elif type == "object_detection":
+        result_type = ObjectDetectionResult
+    else:
+        raise ValueError(f"Unsupported analysis type: {type}")
+
+    llm = AzureOpenAiWrapper().chat_model.with_structured_output(result_type)
+    result = llm.invoke(
+        input=[
+            SystemMessage(
+                content="You are a helpful assistant that performs image analysis tasks. "
+                "You will be provided with an image in base64 format. "
+                "Analyze the image and provide the required information based on the user's request."
+            ),
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Analyze the following image and provide the required information.",
+                    },
+                    {
+                        "type": "image",
+                        "source_type": "base64",
+                        "data": load_image_to_base64(file_path),
+                        "mime_type": "image/png",
+                    },
+                ]
+            ),
+        ],
+    )
+    logger.info(f"Result: {result}")
+
+
 if __name__ == "__main__":
     load_dotenv(
         override=True,
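
The command body calls a load_image_to_base64 helper that is defined elsewhere in scripts/azure_openai_operator.py and is not part of this diff. Below is a minimal sketch of what such a helper presumably does, assuming it simply reads the file and returns a base64 string (a hypothetical reconstruction, not the repository's actual implementation):

import base64
from pathlib import Path


def load_image_to_base64(file_path: str) -> str:
    # Read the raw image bytes and return them as a base64-encoded ASCII string,
    # matching the "data" field expected by the base64 image content block above.
    return base64.b64encode(Path(file_path).read_bytes()).decode("utf-8")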

template_langgraph/internals/models/__init__.py

Whitespace-only changes.
template_langgraph/internals/models/image_analysis.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import logging
+from dataclasses import dataclass
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class ObjectTag(Enum):
+    Book = "Book"
+    Person = "Person"
+    Car = "Car"
+    Dog = "Dog"
+    Cat = "Cat"
+    # Add more tags as needed
+
+
+# https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/concept-object-detection-40
+@dataclass
+class BoundingBox:
+    left: int
+    top: int
+    width: int
+    height: int
+
+
+class ObjectDetectionResponse(BaseModel):
+    """
+    Object Detection Response Model
+    """
+
+    name: ObjectTag = Field(description="Detected object tag")
+    confidence: float = Field(description="Confidence score of the detection (0 to 1)")
+    bounding_box: BoundingBox = Field(description="Bounding box of the detected object")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ObjectDetectionResult(BaseModel):
+    """
+    Object Detection Result Model
+    """
+
+    objects: list[ObjectDetectionResponse] = Field(description="List of detected objects")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
+
+
+class ImageCaptioningResult(BaseModel):
+    """
+    Image Captioning Result Model
+    """
+
+    caption: str = Field(description="Caption of the image")
+    confidence: float = Field(description="Confidence score of the caption")
+
+    def __str__(self) -> str:
+        return self.model_dump_json(indent=2)
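
To illustrate the schema these models describe, here is a small self-contained sketch that builds an ObjectDetectionResult by hand; the values are invented for illustration, and the import path assumes the new module is template_langgraph/internals/models/image_analysis.py, as the CLI import above suggests. In the CLI itself the fields are populated by the with_structured_output call rather than by hand.

from template_langgraph.internals.models.image_analysis import (
    BoundingBox,
    ObjectDetectionResponse,
    ObjectDetectionResult,
    ObjectTag,
)

# Hypothetical detection: one dog with an invented confidence and bounding box,
# purely to show the JSON shape the structured-output call is asked to produce.
result = ObjectDetectionResult(
    objects=[
        ObjectDetectionResponse(
            name=ObjectTag.Dog,
            confidence=0.92,
            bounding_box=BoundingBox(left=10, top=20, width=120, height=80),
        )
    ]
)
print(result)  # __str__ renders the result as indented JSON via model_dump_json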
